diff --git a/import-automation/executor/Dockerfile b/import-automation/executor/Dockerfile index b473cea7fc..c62d6e043b 100644 --- a/import-automation/executor/Dockerfile +++ b/import-automation/executor/Dockerfile @@ -23,13 +23,8 @@ ENV JAVA_HOME=/usr/local/openjdk-17 COPY --from=openjdk:17-slim $JAVA_HOME $JAVA_HOME ENV PATH="${JAVA_HOME}/bin:${PATH}" -WORKDIR /workspace - -ADD requirements.txt /workspace/requirements.txt -RUN pip install -r /workspace/requirements.txt - RUN git clone https://github.com/datacommonsorg/data.git -RUN wget https://github.com/datacommonsorg/import/releases/download/0.1-alpha.1k/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar -COPY app/. /workspace/app/ - -CMD gunicorn --timeout 0 --workers 5 -b :$PORT app.main:FLASK_APP +WORKDIR /data/import-automation/executor +RUN wget https://storage.googleapis.com/datacommons_public/import_tools/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar +RUN pip install -r requirements.txt +ENTRYPOINT ["python", "main.py"] diff --git a/import-automation/executor/app/configs.py b/import-automation/executor/app/configs.py index 60e615a997..154b559ac8 100644 --- a/import-automation/executor/app/configs.py +++ b/import-automation/executor/app/configs.py @@ -84,6 +84,8 @@ class ExecutorConfig: dashboard_oauth_client_id: str = '' # Oauth Client ID used to authenticate with the proxy. importer_oauth_client_id: str = '' + # URL for the import executor container image. + importer_docker_image: str = 'gcr.io/datcom-ci/dc-import-executor:stable' # Access token of the account used to authenticate with GitHub. This is not # the account password. See # https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token. @@ -105,12 +107,18 @@ class ExecutorConfig: requirements_filename: str = 'requirements.txt' # ID of the location where Cloud Scheduler is hosted. scheduler_location: str = 'us-central1' + # Location of the local git data repo. + local_repo_dir: str = '/data' # Maximum time a user script can run for in seconds. user_script_timeout: float = 3600 # Arguments for the user script user_script_args: List[str] = () # Environment variables for the user script user_script_env: dict = None + # Invoke validations before upload. + invoke_import_validation: bool = False + # Import validation config file. + validation_config_file: str = 'tools/import_validation/validation_config.json' # Maximum time venv creation can take in seconds. venv_create_timeout: float = 3600 # Maximum time downloading a file can take in seconds. @@ -121,8 +129,10 @@ class ExecutorConfig: email_account: str = '' # The corresponding password, app password, or access token. email_token: str = '' - # Disbale email alert notifications. + # Disable email alert notifications. disable_email_notifications: bool = False + # Skip uploading the data to GCS (for local testing). + skip_gcs_upload: bool = False # Maximum time a blocking call to the importer to # perform an import can take in seconds. importer_import_timeout: float = 20 * 60 @@ -130,8 +140,8 @@ class ExecutorConfig: # delete an import can take in seconds. 
     importer_delete_timeout: float = 10 * 60
     # Executor type depends on where the executor runs
-    # Suppports one of: "GKE", "GAE"
-    executor_type: str = 'GAE'
+    # Supports one of: "GKE", "GAE", "CLOUD_RUN"
+    executor_type: str = 'CLOUD_RUN'
 
     def get_data_refresh_config(self):
         """Returns the config used for Cloud Scheduler data refresh jobs."""
diff --git a/import-automation/executor/app/executor/cloud_run.py b/import-automation/executor/app/executor/cloud_run.py
index 3895a4ace2..0452d3cd21 100644
--- a/import-automation/executor/app/executor/cloud_run.py
+++ b/import-automation/executor/app/executor/cloud_run.py
@@ -24,15 +24,12 @@
 from absl import logging
 from google.api_core.exceptions import NotFound
 from google.cloud import run_v2
+from google.protobuf import duration_pb2
 
 
-def create_or_update_cloud_run_job(
-    project_id: str,
-    location: str,
-    job_id: str,
-    image: str,
-    env_vars: dict,
-) -> run_v2.Job:
+def create_or_update_cloud_run_job(project_id: str, location: str, job_id: str,
+                                   image: str, env_vars: dict, args: list,
+                                   resources: dict,
+                                   timeout: int = 3600) -> run_v2.Job:
     """Creates a new cloud run job or updates an existing one.
 
     If the jobs exists, the container is updated with new image and environment
@@ -45,6 +42,9 @@ def create_or_update_cloud_run_job(
       job_id: Name of the job
       image: Container image URL such as 'gcr.io/your-project/your-image:latest'
       env_vars: dict of environment variables as {'VAR': ''}
+      args: list of command line arguments
+      resources: cpu/memory resources
+      timeout: duration in seconds (defaults to 3600)
 
     Returns:
       Job created as a dict.
@@ -59,17 +59,23 @@ def create_or_update_cloud_run_job(
     for var, value in env_vars.items():
         env.append(run_v2.EnvVar(name=var, value=value))
-    container = run_v2.Container(image=image, env=env)
-    exe_template = run_v2.ExecutionTemplate(template=run_v2.TaskTemplate(
-        containers=[container]))
+    res = run_v2.types.ResourceRequirements(limits=resources)
+    container = run_v2.Container(image=image, env=env, resources=res, args=args)
+    # Labels allow filtering of automated import cloud run jobs, used in log-based metrics.
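+    # Each task retries up to 2 times and is bounded by the given timeout (in seconds).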
+ exe_template = run_v2.ExecutionTemplate( + labels={"datacommons_cloud_run_job_type": "auto_import_job"}, + template=run_v2.TaskTemplate( + containers=[container], + max_retries=2, + timeout=duration_pb2.Duration(seconds=timeout))) new_job = run_v2.Job(template=exe_template) - logging.info(f"Creating job {job_name}: {new_job}") + logging.info(f"Creating job: {job_name}") # Look for existing job to update job = None try: job = client.get_job(request=run_v2.GetJobRequest(name=job_name)) - logging.info(f"Found existing job {job_name}: {job}") + logging.info(f"Found existing job: {job_name}") except NotFound: logging.info(f"No existing job, creating new job: {job_name}") @@ -85,11 +91,11 @@ def create_or_update_cloud_run_job( # Update existing Cloud Run job # Overrides container settings including image, env job.template.template.containers = new_job.template.template.containers - logging.info(f"Updating job {job_name}: {job}") + logging.info(f"Updating job: {job_name}") update_request = run_v2.UpdateJobRequest(job=job) update_operation = client.update_job(request=update_request) result = update_operation.result() # Blocks until update completes - logging.info(f"Job updated {job_name}: {result}") + logging.info(f"Job updated: {job_name}") return result diff --git a/import-automation/executor/app/executor/cloud_run_simple_import.py b/import-automation/executor/app/executor/cloud_run_simple_import.py index 380e3a3b19..51736d4f24 100644 --- a/import-automation/executor/app/executor/cloud_run_simple_import.py +++ b/import-automation/executor/app/executor/cloud_run_simple_import.py @@ -188,8 +188,11 @@ def cloud_run_simple_import_job( logging.info( f'Setting up simple import cloud run {project_id}:{job_id} for' f' {config_file} with output: {gcs_output_dir}, env: {env_vars}') + resources = {} + args = [] job = cloud_run.create_or_update_cloud_run_job(project_id, location, job_id, - image, env_vars) + image, env_vars, args, + resources) if not job: logging.error( f'Failed to setup cloud run job {job_id} for {config_file}') diff --git a/import-automation/executor/app/executor/cloud_scheduler.py b/import-automation/executor/app/executor/cloud_scheduler.py index e89ec592b8..68e70afd8f 100644 --- a/import-automation/executor/app/executor/cloud_scheduler.py +++ b/import-automation/executor/app/executor/cloud_scheduler.py @@ -26,6 +26,7 @@ from google.protobuf import json_format from google.api_core.exceptions import AlreadyExists, NotFound +CLOUD_RUN_SERVICE_ACCOUNT = os.getenv('CLOUD_SCHEDULER_CALLER_SA') GKE_SERVICE_DOMAIN = os.getenv('GKE_SERVICE_DOMAIN', 'importautomation.datacommons.org') GKE_CALLER_SERVICE_ACCOUNT = os.getenv('CLOUD_SCHEDULER_CALLER_SA') @@ -50,15 +51,38 @@ def _base_job_request(absolute_import_name, schedule: str): # 30m is the max allowed deadline 'seconds': 60 * 30 } - # <'http_request'|'appengine_job_request'>: {...} + # <'gke_job_request'|'appengine_job_request'|'cloud_run_job_request'>: {...} } -def http_job_request(absolute_import_name, - schedule, - json_encoded_job_body: str, - gke_caller_service_account: str = "", - gke_oauth_audience: str = "") -> Dict: +def cloud_run_job_request(absolute_import_name, schedule, + cloud_run_job_url: str, + cloud_run_service_account: str) -> Dict: + """Cloud Scheduler request that targets jobs in CLOUD_RUN.""" + json_encoded_job_body = json.dumps({}).encode("utf-8") + job = _base_job_request(absolute_import_name, schedule) + job_name = absolute_import_name.split(':')[1] + job['name'] = f'{job_name}' + job['http_target'] = { + 'uri': 
f'https://{cloud_run_job_url}', + 'http_method': 'POST', + 'headers': { + 'Content-Type': 'application/json', + }, + 'body': json_encoded_job_body, + 'oauth_token': { + 'service_account_email': f'{cloud_run_service_account}', + 'scope': 'https://www.googleapis.com/auth/cloud-platform' + } + } + return job + + +def gke_job_request(absolute_import_name, + schedule, + json_encoded_job_body: str, + gke_caller_service_account: str = "", + gke_oauth_audience: str = "") -> Dict: """Cloud Scheduler request that targets executors launched in GKE.""" # If the service account and oauth audience are provided as diff --git a/import-automation/executor/app/executor/import_executor.py b/import-automation/executor/app/executor/import_executor.py index 8396f95465..33740da080 100644 --- a/import-automation/executor/app/executor/import_executor.py +++ b/import-automation/executor/app/executor/import_executor.py @@ -17,15 +17,29 @@ """ import dataclasses +import glob import json import logging import os +import sys import subprocess import tempfile import time import traceback from typing import Callable, Dict, Iterable, List, Optional, Tuple +REPO_DIR = os.path.dirname( + os.path.dirname( + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) +sys.path.append(os.path.join(REPO_DIR, 'tools', 'import_differ')) +sys.path.append(os.path.join(REPO_DIR, 'tools', 'import_validation')) +sys.path.append(os.path.join(REPO_DIR, 'util')) + +import file_util + +from import_differ import ImportDiffer +from import_validation import ImportValidation from app import configs from app import utils from app.executor import cloud_run_simple_import @@ -34,6 +48,7 @@ from app.service import file_uploader from app.service import github_api from app.service import import_service +from google.cloud import storage # Email address for status messages. _DEBUG_EMAIL_ADDR = 'datacommons-debug+imports@google.com' @@ -240,7 +255,6 @@ def _execute_imports_on_commit_helper( with tempfile.TemporaryDirectory() as tmpdir: repo_dir = self.github.download_repo( tmpdir, commit_sha, self.config.repo_download_timeout) - logging.info(f'Downloaded repo: {repo_dir}') imports_to_execute = import_target.find_imports_to_execute( @@ -318,6 +332,100 @@ def _import_one( ) raise exc + def _invoke_import_validation(self, repo_dir: str, relative_import_dir: str, + absolute_import_dir: str, + import_spec: dict) -> None: + """ + Performs validations on import data. + """ + import_inputs = import_spec.get('import_inputs', []) + for import_input in import_inputs: + mcf_path = import_input['node_mcf'] + if not mcf_path: + # TODO: Generate node mcf using dc-import tool + logging.error( + 'Empty node_mcf in manifest, skipping validation.') + current_data_path = os.path.join(absolute_import_dir, mcf_path) + previous_data_path = os.path.join(absolute_import_dir, + 'previous_data.mcf') + summary_stats = os.path.join(absolute_import_dir, + 'summary_report.csv') + validation_output_path = os.path.join(absolute_import_dir, + 'validation') + config_file = import_spec.get('validation_config_file', '') + if config_file: + config_file_path = os.path.join(absolute_import_dir, + config_file) + else: + config_file_path = os.path.join( + repo_dir, self.config.validation_config_file) + logging.info(f'Validation config file: {config_file_path}') + + # Download previous import data. 
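+            # The latest released version is read from latest_version.txt under the import's GCS folder.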
+ bucket = storage.Client(self.config.gcs_project_id).bucket( + self.config.storage_prod_bucket_name) + folder = relative_import_dir + '/' + import_spec['import_name'] + '/' + blob = bucket.blob(folder + 'latest_version.txt') + if not blob: + logging.error( + f'Not able to download latest_version.txt from {folder}, skipping validation.' + ) + return + latest_version = blob.download_as_text() + blob = bucket.blob(folder + latest_version + '/' + mcf_path) + if not blob: + logging.error( + f'Not able to download previous import from {latest_version}, skipping validation.' + ) + return + # blob.download_to_filename(previous_data_path) + + # Invoke differ script. + differ = ImportDiffer(current_data_path, previous_data_path, + validation_output_path) + differ.run_differ() + + # Invoke validation script. + validation_output = os.path.join(validation_output_path, + 'validation_output.csv') + differ_output = os.path.join(validation_output_path, + 'point_analysis_summary.csv') + validation = ImportValidation(config_file_path, differ_output, + summary_stats, validation_output) + validation.run_validations() + + def _invoke_import_job(self, absolute_import_dir: str, import_spec: dict, + version: str, interpreter_path: str, + process: subprocess.CompletedProcess) -> None: + script_paths = import_spec.get('scripts') + for path in script_paths: + script_path = os.path.join(absolute_import_dir, path) + simple_job = cloud_run_simple_import.get_simple_import_job_id( + import_spec, script_path) + if simple_job: + # Running simple import as cloud run job. + cloud_run_simple_import.cloud_run_simple_import_job( + import_spec=import_spec, + config_file=script_path, + env=self.config.user_script_env, + version=version, + image=import_spec.get('image'), + ) + else: + # Run import script locally. + script_interpreter = _get_script_interpreter( + script_path, interpreter_path) + process = _run_user_script( + interpreter_path=script_interpreter, + script_path=script_path, + timeout=self.config.user_script_timeout, + args=self.config.user_script_args, + cwd=absolute_import_dir, + env=self.config.user_script_env, + ) + _log_process(process=process) + process.check_returncode() + def _import_one_helper( self, repo_dir: str, @@ -341,7 +449,8 @@ def _import_one_helper( requirements_path = os.path.join(absolute_import_dir, self.config.requirements_filename) central_requirements_path = os.path.join( - repo_dir, self.config.requirements_filename) + repo_dir, 'import-automation', 'executor', + self.config.requirements_filename) interpreter_path, process = _create_venv( (central_requirements_path, requirements_path), tmpdir, @@ -351,35 +460,23 @@ def _import_one_helper( _log_process(process=process) process.check_returncode() - script_paths = import_spec.get('scripts') - for path in script_paths: - script_path = os.path.join(absolute_import_dir, path) - simple_job = cloud_run_simple_import.get_simple_import_job_id( - import_spec, script_path) - if simple_job: - # Running simple import as cloud run job. - cloud_run_simple_import.cloud_run_simple_import_job( - import_spec=import_spec, - config_file=script_path, - env=self.config.user_script_env, - version=version, - image=import_spec.get('image'), - ) - else: - # Run import script locally. 
- script_interpreter = _get_script_interpreter( - script_path, interpreter_path) - process = _run_user_script( - interpreter_path=script_interpreter, - script_path=script_path, - timeout=self.config.user_script_timeout, - args=self.config.user_script_args, - cwd=absolute_import_dir, - env=self.config.user_script_env, - name=import_name, - ) - _log_process(process=process) - process.check_returncode() + self._invoke_import_job(absolute_import_dir=absolute_import_dir, + import_spec=import_spec, + version=version, + interpreter_path=interpreter_path, + process=process) + + if self.config.invoke_import_validation: + logging.info("Invoking import validations") + self._invoke_import_validation( + repo_dir=repo_dir, + relative_import_dir=relative_import_dir, + absolute_import_dir=absolute_import_dir, + import_spec=import_spec) + + if self.config.skip_gcs_upload: + logging.info("Skipping GCS upload") + return inputs = self._upload_import_inputs( import_dir=absolute_import_dir, @@ -388,6 +485,14 @@ def _import_one_helper( import_spec=import_spec, ) + validation_output_path = os.path.join(absolute_import_dir, 'validation') + for filepath in glob.iglob(f'{validation_output_path}/*.csv'): + dest = f'{relative_import_dir}/{import_name}/{version}/validation/{os.path.basename(filepath)}' + self.uploader.upload_file( + src=filepath, + dest=dest, + ) + if self.importer: self.importer.delete_previous_output(relative_import_dir, import_spec) @@ -438,13 +543,32 @@ def _upload_import_inputs( for import_input in import_inputs: for input_type in self.config.import_input_types: path = import_input.get(input_type) - if path: - dest = f'{output_dir}/{version}/{os.path.basename(path)}' - self._upload_file_helper( - src=os.path.join(import_dir, path), - dest=dest, - ) - setattr(uploaded, input_type, dest) + if not path: + continue + for file in file_util.file_get_matching( + os.path.join(import_dir, path)): + if file: + dest = f'{output_dir}/{version}/{os.path.basename(file)}' + self._upload_file_helper( + src=file, + dest=dest, + ) + uploaded_dest = f'{output_dir}/{version}/{os.path.basename(path)}' + setattr(uploaded, input_type, uploaded_dest) + + # Upload any files downloaded from source + source_files = [ + os.path.join(import_dir, file) + for file in import_spec.get('source_files', []) + ] + source_files = file_util.file_get_matching(source_files) + for file in source_files: + dest = f'{output_dir}/{version}/source_files/{os.path.basename(file)}' + self._upload_file_helper( + src=file, + dest=dest, + ) + self.uploader.upload_string( version, os.path.join(output_dir, self.config.storage_version_filename)) @@ -467,15 +591,15 @@ def _import_metadata_mcf_helper(self, import_spec: dict) -> str: Args: import_spec: Specification of the import as a dict. - + Returns: import_metadata_mcf node. 
""" node = _IMPORT_METADATA_MCF_TEMPLATE.format_map({ "import_name": import_spec.get('import_name'), - "last_data_refresh_date": _clean_date(utils.pacific_time()) + "last_data_refresh_date": _clean_date(utils.utctime()) }) - next_data_refresh_date = utils.next_pacific_date( + next_data_refresh_date = utils.next_utc_date( import_spec.get('cron_schedule')) if next_data_refresh_date: node += f'nextDataRefreshDate: "{next_data_refresh_date}"\n' diff --git a/import-automation/executor/app/executor/scheduler_job_manager.py b/import-automation/executor/app/executor/scheduler_job_manager.py index e1f7e775e7..e22afb9e3e 100644 --- a/import-automation/executor/app/executor/scheduler_job_manager.py +++ b/import-automation/executor/app/executor/scheduler_job_manager.py @@ -33,6 +33,7 @@ import traceback import tempfile from typing import Dict +import cloud_run from app import configs from app.service import github_api @@ -40,6 +41,9 @@ from app.executor import import_executor from app.executor import cloud_scheduler +_GKE_SERVICE_ACCOUNT_KEY: str = 'gke_service_account' +_GKE_OAUTH_AUDIENCE_KEY: str = 'gke_oauth_audience' + def schedule_on_commit(github: github_api.GitHubRepoAPI, config: configs.ExecutorConfig, commit_sha: str): @@ -66,18 +70,13 @@ def schedule_on_commit(github: github_api.GitHubRepoAPI, scheduled = [] for relative_dir, spec in imports_to_execute: - schedule = spec.get('cron_schedule') - if not schedule: - manifest_path = os.path.join(relative_dir, - config.manifest_filename) - raise KeyError(f'cron_schedule not found in {manifest_path}') try: absolute_import_name = import_target.get_absolute_import_name( relative_dir, spec['import_name']) logging.info('Scheduling a data update job for %s', absolute_import_name) - job = _create_or_update_import_schedule(absolute_import_name, - schedule, config) + job = create_or_update_import_schedule(absolute_import_name, + spec, config, {}) scheduled.append(job) except Exception: raise import_executor.ExecutionError( @@ -87,26 +86,70 @@ def schedule_on_commit(github: github_api.GitHubRepoAPI, 'No issues') -def _create_or_update_import_schedule(absolute_import_name, schedule: str, - config: configs.ExecutorConfig): +def create_or_update_import_schedule(absolute_import_name: str, + import_spec: dict, + config: configs.ExecutorConfig, + scheduler_config_dict: Dict): """Create/Update the import schedule for 1 import.""" - # Note: this is the content of what is passed to /update API - # inside each cronjob http calls. - json_encoded_job_body = json.dumps({ - 'absolute_import_name': absolute_import_name, - 'configs': config.get_data_refresh_config() - }).encode() + schedule = import_spec.get('cron_schedule') + if not schedule: + raise KeyError( + f'cron_schedule not found in manifest for {absolute_import_name}') + resources = {"cpu": "2", "memory": "4G"} # default resources. + if 'resource_limits' in import_spec: + resources.update(import_spec['resource_limits']) + timeout = config.user_script_timeout + if 'user_script_timeout' in import_spec: + timeout = import_spec['user_script_timeout'] if config.executor_type == "GKE": - req = cloud_scheduler.http_job_request(absolute_import_name, schedule, - json_encoded_job_body) + # Note: this is the content of what is passed to /update API + # inside each cronjob http calls. + json_encoded_job_body = json.dumps({ + 'absolute_import_name': absolute_import_name, + 'configs': config.get_data_refresh_config() + }).encode('utf-8') + # Before proceeding, ensure that the configs read from GCS have the expected fields. 
+ assert _GKE_SERVICE_ACCOUNT_KEY in scheduler_config_dict + assert _GKE_OAUTH_AUDIENCE_KEY in scheduler_config_dict + service_account_key = scheduler_config_dict[_GKE_SERVICE_ACCOUNT_KEY] + oauth_audience_key = scheduler_config_dict[_GKE_OAUTH_AUDIENCE_KEY] + req = cloud_scheduler.gke_job_request(absolute_import_name, schedule, + json_encoded_job_body, + service_account_key, + oauth_audience_key) elif config.executor_type == "GAE": + json_encoded_job_body = json.dumps({ + 'absolute_import_name': absolute_import_name, + 'configs': config.get_data_refresh_config() + }).encode('utf-8') req = cloud_scheduler.appengine_job_request(absolute_import_name, schedule, json_encoded_job_body) + elif config.executor_type == "CLOUD_RUN": + docker_image = config.importer_docker_image + job_name = absolute_import_name.split(':')[1] + + json_encoded_config = json.dumps(config.get_data_refresh_config()) + args = [ + f'--import_name={absolute_import_name}', + f'--import_config={json_encoded_config}' + ] + env_vars = {} + job = cloud_run.create_or_update_cloud_run_job( + config.gcp_project_id, config.scheduler_location, job_name, + docker_image, env_vars, args, resources, timeout) + job_id = job.name.rsplit('/', 1)[1] + if not job: + logging.error( + f'Failed to setup cloud run job for {absolute_import_name}') + cloud_run_job_url = f'{config.scheduler_location}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/{config.gcp_project_id}/jobs/{job_id}:run' + req = cloud_scheduler.cloud_run_job_request( + absolute_import_name, schedule, cloud_run_job_url, + scheduler_config_dict[_GKE_SERVICE_ACCOUNT_KEY]) else: raise Exception( - "Invalid executor_type %s, expects one of ('GKE', 'GAE')", + "Invalid executor_type %s, expects one of ('GKE', 'GAE', 'CLOUD_RUN')", config.executor_type) return cloud_scheduler.create_or_update_job(config.gcp_project_id, diff --git a/import-automation/executor/app/utils.py b/import-automation/executor/app/utils.py index 6d320c62cb..fe1a04d10b 100644 --- a/import-automation/executor/app/utils.py +++ b/import-automation/executor/app/utils.py @@ -40,9 +40,9 @@ def pacific_time(): return datetime.datetime.now(pytz.timezone(_PACIFIC_TIME)).isoformat() -def next_pacific_date(cron_expression: str, from_time: str = None) -> str: - """Returns the next date from today in ISO8601 with timezone - America/Los_Angeles, given a cron schedule. +def next_utc_date(cron_expression: str, from_time: str = None) -> str: + """Returns the next date from today in ISO8601 with timezone UTC+0, + given a cron schedule. Args: cron_expression: Expression for cron schedule. @@ -53,7 +53,7 @@ def next_pacific_date(cron_expression: str, from_time: str = None) -> str: """ try: if not from_time: - from_time = datetime.datetime.now(pytz.timezone(_PACIFIC_TIME)) + from_time = datetime.datetime.now(datetime.timezone.utc) iter = croniter(cron_expression, from_time) return iter.get_next(datetime.datetime).date().isoformat() except Exception as e: diff --git a/import-automation/executor/cloudbuild.yaml b/import-automation/executor/cloudbuild.yaml new file mode 100644 index 0000000000..0d185b7a76 --- /dev/null +++ b/import-automation/executor/cloudbuild.yaml @@ -0,0 +1,36 @@ +# Builds the docker image of import executor, verifies using integration test, +# and pushes it to artifact registry. +# +# Run it using: +# gcloud builds submit --config=cloudbuild.yaml --substitutions=_DOCKER_IMAGE="us-docker.pkg.dev/datcom-ci/gcr.io/dc-import-executor" . 
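+#
+# Note: COMMIT_SHA is populated automatically for builds started by a Cloud Build trigger.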
+
+steps:
+  # Docker Build
+  - name: 'gcr.io/cloud-builders/docker'
+    args: ['build', '-t', '${_DOCKER_IMAGE}:${COMMIT_SHA}', '-t', '${_DOCKER_IMAGE}:latest', '.']
+    dir: 'import-automation/executor'
+
+  # Docker push to Google Artifact Registry
+  - name: 'gcr.io/cloud-builders/docker'
+    args: ['push', '${_DOCKER_IMAGE}', '--all-tags']
+
+  # Install dependencies
+  - name: python:3.11.11
+    entrypoint: pip
+    args: ['install', '-r', 'requirements.txt', '--user']
+    dir: 'import-automation/executor'
+
+  # Run import executor integration test
+  - name: python:3.11.11
+    entrypoint: python
+    args: ['verify_import.py']
+    dir: 'import-automation/executor'
+
+  # Tag image as stable and push
+  - name: 'gcr.io/cloud-builders/docker'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        docker tag ${_DOCKER_IMAGE}:${COMMIT_SHA} ${_DOCKER_IMAGE}:stable \
+        && docker push ${_DOCKER_IMAGE}:stable
diff --git a/import-automation/executor/main.py b/import-automation/executor/main.py
new file mode 100644
index 0000000000..696c27f25c
--- /dev/null
+++ b/import-automation/executor/main.py
@@ -0,0 +1,91 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Import executor entrypoint.
+"""
+import logging
+import json
+import os
+import time
+
+from absl import flags
+from absl import app
+
+from app import configs
+from app.executor import import_executor
+from app.service import file_uploader
+from app.service import github_api
+from app.service import email_notifier
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string('import_name', '', 'Absolute import name.')
+flags.DEFINE_string('import_config', '', 'Import executor configuration.')
+
+CLOUD_RUN_JOB_NAME = os.getenv("CLOUD_RUN_JOB")
+# The `log_type` label helps filter log lines, which is useful for creating
+# log-based metrics. Each log type has a similar set of fields for easier parsing.
+LOG_TYPE_LABEL = "log_type"
+# log_type for capturing status of auto import cloud run jobs.
+# Required fields - log_type, message, status, latency_secs.
+AUTO_IMPORT_JOB_STATUS_LOG_TYPE = "auto-import-job-status"
+
+
+def scheduled_updates(absolute_import_name: str, import_config: str):
+    """
+    Invokes import update workflow.
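+
+    Args:
+        absolute_import_name: Absolute import name of the form
+            '<path/to/import/dir>:<import_name>'.
+        import_config: JSON-encoded ExecutorConfig fields for this run.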
+ """ + start_time = time.time() + logging.info(absolute_import_name) + cfg = json.loads(import_config) + config = configs.ExecutorConfig(**cfg) + executor = import_executor.ImportExecutor( + uploader=file_uploader.GCSFileUploader( + project_id=config.gcs_project_id, + bucket_name=config.storage_prod_bucket_name), + github=github_api.GitHubRepoAPI( + repo_owner_username=config.github_repo_owner_username, + repo_name=config.github_repo_name, + auth_username=config.github_auth_username, + auth_access_token=config.github_auth_access_token), + config=config, + notifier=email_notifier.EmailNotifier(config.email_account, + config.email_token), + local_repo_dir=config.local_repo_dir) + result = executor.execute_imports_on_update(absolute_import_name) + logging.info(result) + elapsed_time_secs = int(time.time() - start_time) + message = (f"Cloud Run Job [{CLOUD_RUN_JOB_NAME}] completed with status= " + f"[{result.status}] in [{elapsed_time_secs}] seconds.)") + # With Python logging lib, json is interpreted as text (populates textPayload field). + # Using print to populate json as structured logs (populate jsonPayload field). + # Ref: https://cloud.google.com/functions/docs/monitoring/logging#writing_structured_logs + print( + json.dumps({ + LOG_TYPE_LABEL: AUTO_IMPORT_JOB_STATUS_LOG_TYPE, + "message": message, + "severity": "INFO" if result.status == 'succeeded' else "ERROR", + "status": result.status, + "latency_secs": elapsed_time_secs, + })) + if result.status == 'failed': + return 1 + return 0 + + +def main(_): + return scheduled_updates(FLAGS.import_name, FLAGS.import_config) + + +if __name__ == '__main__': + app.run(main) diff --git a/import-automation/executor/requirements.txt b/import-automation/executor/requirements.txt index 956b49e547..92692ffb3a 100644 --- a/import-automation/executor/requirements.txt +++ b/import-automation/executor/requirements.txt @@ -1,14 +1,40 @@ -requests -protobuf +# Requirements for Python scripts in this repo that have automation enabled! 
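+# This file is also included by the top-level requirements_all.txt (via -r).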
+ +absl-py +arcgis2geojson +beautifulsoup4 +chardet +croniter +dataclasses==0.6 +datacommons +frozendict +func-timeout==4.3.5 +geojson==2.5.0 google-auth -google-cloud-logging +google-cloud-bigquery +google-cloud-datastore google-cloud-run google-cloud-storage -google-cloud-datastore -google-cloud-scheduler==2.10.0 -gspread -flask +google-cloud-logging +google-cloud-scheduler +gspread==5.12.0 gunicorn +lxml==4.9.1 +numpy==1.26.4 +openpyxl>=3.1.0 +pandas +protobuf +psutil +pylint +pytest pytz -absl-py -croniter +ratelimit +requests==2.27.1 +requests_cache +retry==0.9.2 +shapely==1.8.5 +urllib3==1.26.8 +xarray==0.19.0 +xlrd +xlsxwriter==3.2.0 +zipp diff --git a/import-automation/executor/schedule_update_import.py b/import-automation/executor/schedule_update_import.py index b815faca52..9b1253af03 100644 --- a/import-automation/executor/schedule_update_import.py +++ b/import-automation/executor/schedule_update_import.py @@ -25,6 +25,7 @@ from app.executor import import_target from app.executor import import_executor from app.executor import cloud_scheduler +from app.executor import scheduler_job_manager from app.executor import validation from app.service import email_notifier from app.service import file_uploader @@ -32,8 +33,6 @@ from google.cloud import storage _CONFIG_OVERRIDE_FILE: str = 'config_override.json' -_GKE_SERVICE_ACCOUNT_KEY: str = 'gke_service_account' -_GKE_OAUTH_AUDIENCE_KEY: str = 'gke_oauth_audience' _FLAGS = flags.FLAGS @@ -62,8 +61,8 @@ logging.basicConfig(level=logging.INFO) -def _get_cron_schedule(repo_dir: str, absolute_import_path: str, - manifest_filename: str): +def _get_import_spec(repo_dir: str, absolute_import_path: str, + manifest_filename: str): # Retain the path to the import (ignoring the name of the import). path = absolute_import_path.split(":")[0] @@ -79,7 +78,7 @@ def _get_cron_schedule(repo_dir: str, absolute_import_path: str, for spec in manifest['import_specifications']: if absolute_import_path.endswith(':' + spec['import_name']): - return spec['cron_schedule'] + return spec # If we are here, the the import name was not found in the manifest. raise Exception( @@ -240,34 +239,6 @@ def update(cfg: configs.ExecutorConfig, return executor.execute_imports_on_update(absolute_import_path) -def schedule(cfg: configs.ExecutorConfig, - absolute_import_name: str, - repo_dir: str, - gke_service_account: str = "", - gke_oauth_audience: str = "") -> Dict: - # This is the content of what is passed to /update API - # inside each cronjob http calls from Cloud Scheduler. - json_encoded_job_body = json.dumps({ - 'absolute_import_name': absolute_import_name, - 'configs': cfg.get_data_refresh_config() - }).encode("utf-8") - - # Retrieve the cron schedule. - cron_schedule = _get_cron_schedule(repo_dir, absolute_import_name, - cfg.manifest_filename) - - # Create an HTTP Job Request. - req = cloud_scheduler.http_job_request( - absolute_import_name, - cron_schedule, - json_encoded_job_body, - gke_caller_service_account=gke_service_account, - gke_oauth_audience=gke_oauth_audience) - - return cloud_scheduler.create_or_update_job(cfg.gcp_project_id, - cfg.scheduler_location, req) - - def main(_): mode = _FLAGS.mode absolute_import_path = _FLAGS.absolute_import_path @@ -335,19 +306,14 @@ def main(_): _print_fileupload_results(cfg, absolute_import_path) elif mode == 'schedule': - # Before proceeding, ensure that the configs read from GCS have the expected fields. 
- assert _GKE_SERVICE_ACCOUNT_KEY in scheduler_config_dict - assert _GKE_OAUTH_AUDIENCE_KEY in scheduler_config_dict - logging.info("*************************************************") logging.info("***** Beginning Schedule Operation **************") logging.info("*************************************************") - res = schedule( - cfg, - absolute_import_path, - repo_dir, - gke_service_account=scheduler_config_dict[_GKE_SERVICE_ACCOUNT_KEY], - gke_oauth_audience=scheduler_config_dict[_GKE_OAUTH_AUDIENCE_KEY]) + # Retrieve the cron schedule. + import_spec = _get_import_spec(repo_dir, absolute_import_path, + cfg.manifest_filename) + res = scheduler_job_manager.create_or_update_import_schedule( + absolute_import_path, import_spec, cfg, scheduler_config_dict) logging.info("*************************************************") logging.info("*********** Schedule Operation Complete. ********") logging.info("*************************************************") diff --git a/import-automation/executor/test/cloud_scheduler_test.py b/import-automation/executor/test/cloud_scheduler_test.py index 468249bedb..952ed3a1fb 100644 --- a/import-automation/executor/test/cloud_scheduler_test.py +++ b/import-automation/executor/test/cloud_scheduler_test.py @@ -60,15 +60,15 @@ def test_appengine_job_request(self): } assert DeepDiff(got, want) == {} - def test_http_job_request(self): + def test_gke_job_request(self): absolute_import_name = "scripts/preprocess:A" schedule = "0 5 * * *" json_encoded_job_body = '{"k":"v"}' cloud_scheduler.GKE_CALLER_SERVICE_ACCOUNT = 'account' cloud_scheduler.GKE_OAUTH_AUDIENCE = 'audience' - got = cloud_scheduler.http_job_request(absolute_import_name, schedule, - json_encoded_job_body) + got = cloud_scheduler.gke_job_request(absolute_import_name, schedule, + json_encoded_job_body) want = { 'name': 'scripts_preprocess_A_GKE', 'description': 'scripts/preprocess:A', @@ -97,3 +97,41 @@ def test_http_job_request(self): } } assert DeepDiff(got, want) == {} + + def test_cloud_run_job_request(self): + absolute_import_name = "scripts/preprocess:A" + schedule = "0 5 * * *" + + cloud_run_service_account = 'service_account' + cloud_run_job_url = 'run.googleapis.com/run' + got = cloud_scheduler.cloud_run_job_request(absolute_import_name, + schedule, cloud_run_job_url, + cloud_run_service_account) + want = { + 'name': 'A', + 'description': 'scripts/preprocess:A', + 'schedule': "0 5 * * *", + 'time_zone': 'Etc/UTC', + 'retry_config': { + 'retry_count': 2, + 'min_backoff_duration': { + 'seconds': 60 * 60 + } + }, + 'attempt_deadline': { + 'seconds': 60 * 30 + }, + 'http_target': { + 'uri': 'https://run.googleapis.com/run', + 'http_method': 'POST', + 'headers': { + 'Content-Type': 'application/json', + }, + 'body': b'{}', + 'oauth_token': { + 'service_account_email': 'service_account', + 'scope': 'https://www.googleapis.com/auth/cloud-platform' + } + } + } + assert DeepDiff(got, want) == {} diff --git a/import-automation/executor/test/utils_test.py b/import-automation/executor/test/utils_test.py index cc29475398..55e9655e64 100644 --- a/import-automation/executor/test/utils_test.py +++ b/import-automation/executor/test/utils_test.py @@ -47,15 +47,14 @@ def test_pacific_time_to_datetime_then_back(self): time_datetime = datetime.datetime.fromisoformat(time_iso) self.assertEqual(time_iso, time_datetime.isoformat()) - def test_next_pacific_date(self): - """Tests next_pacific_date.""" + def test_next_utc_date(self): + """Tests next_utc_date.""" # At 00:00 on Friday. 
cron_expression = '0 0 * * FRI' # Friday. from_time = datetime.datetime(2024, 12, 13) - self.assertEqual( - app.utils.next_pacific_date(cron_expression, from_time), - '2024-12-20') + self.assertEqual(app.utils.next_utc_date(cron_expression, from_time), + '2024-12-20') def test_download_file(self): """Response does not have a Content-Disposition header.""" diff --git a/import-automation/executor/verify_import.py b/import-automation/executor/verify_import.py new file mode 100644 index 0000000000..274c0a3006 --- /dev/null +++ b/import-automation/executor/verify_import.py @@ -0,0 +1,70 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Integration test for import executor image rollout. +Runs a test import as a Cloud Run job and verifies output from GCS. +""" +import logging +import os + +from app.executor import cloud_run +from app.executor import file_io +from test import utils + +_PROJECT_ID = 'datcom-ci' +_JOB_ID = 'dc-import-prober' +_LOCATION = 'us-central1' +_GCS_BUCKET = 'datcom-ci-test' +_IMPORT_NAME = 'scripts/us_fed/treasury_constant_maturity_rates:USFed_ConstantMaturityRates' + + +def run_test(): + logging.basicConfig(level=logging.INFO) + # Execute the cloud run job. + logging.info('Running cloud run job: %s', _JOB_ID) + job = cloud_run.execute_cloud_run_job(_PROJECT_ID, _LOCATION, _JOB_ID) + if not job: + logging.error('Failed to execute cloud run job: %s', _JOB_ID) + raise (AssertionError(f'Failed to execute cloud run job {_JOB_ID}')) + logging.info('Completed run: %s', _JOB_ID) + + # Download output data from GCS. + gcs_path = 'gs://' + _GCS_BUCKET + '/' + _IMPORT_NAME.replace(':', + '/') + '/' + file_path = gcs_path + 'latest_version.txt' + logging.info('Downloading data from GCS: %s', file_path) + blob = file_io.file_get_gcs_blob(file_path, True) + if not blob: + logging.error('Failed to get data from: %s', file_path) + raise (AssertionError(f'Failed to get data from {file_path}')) + timestamp = blob.download_as_text() + file_path = gcs_path + timestamp + '/' + 'treasury_constant_maturity_rates.mcf' + logging.info('Downloading data from GCS: %s', file_path) + blob = file_io.file_get_gcs_blob(file_path, True) + if not blob: + logging.error('Failed to get data from: %s', file_path) + raise (AssertionError(f'Failed to get data from {file_path}')) + + # Compare output data with expected data. + output_file = 'prober_output.mcf' + blob.download_to_filename(output_file) + golden_data_file = os.path.join('test', 'data', + 'treasury_constant_maturity_rates.mcf') + if not utils.compare_lines(golden_data_file, output_file, 50): + raise (AssertionError('Prober failure due to data mismatch')) + logging.info("Verified mcf file content for import: %s", _IMPORT_NAME) + + +if __name__ == '__main__': + run_test() diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f8ddfe17d7..0000000000 --- a/requirements.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Requirements for Python scripts in this repo that have automation enabled! 
- -absl-py -arcgis2geojson -dataclasses==0.6 -datacommons==1.4.3 -frozendict -func-timeout==4.3.5 -geojson==2.5.0 -google-cloud-bigquery -google-cloud-run -google-cloud-storage>=2.7.0 -google-cloud-logging==3.4.0 -google-cloud-scheduler==2.10.0 -gspread==5.12.0 -lxml==4.9.1 -numpy==1.26.4 -openpyxl>=3.1.0 -pandas -pylint -pytest -requests==2.27.1 -requests_cache -retry==0.9.2 -shapely==1.8.5 -urllib3==1.26.8 -xarray==0.19.0 -xlrd -zipp -beautifulsoup4 -ratelimit -xlsxwriter==3.2.0 diff --git a/requirements_all.txt b/requirements_all.txt index 975edc670b..0a63c11075 100644 --- a/requirements_all.txt +++ b/requirements_all.txt @@ -1,44 +1,22 @@ # Requirements for all Python code in this repo, except for import-automation -absl-py -arcgis2geojson +# Add new modules to one of: +# - import-automation/executor/requirements.txt: if the script is used in import automation. +# - script-folder/requirements.txt: the module is only needed for that script. +# - requirements_all.txt (here): anything not related to import automation. + +-r import-automation/executor/requirements.txt + chembl-webresource-client>=0.10.2 -dataclasses==0.6 -datacommons==1.4.3 deepdiff==6.3.0 earthengine-api flask_restful==0.3.9 -frozendict -func-timeout==4.3.5 -geojson==2.5.0 geopandas==0.8.1 geopy -google-cloud-run -google-cloud-bigquery -google-cloud-storage>=2.7.0 -google-cloud-logging==3.4.0 -google-cloud-scheduler==2.10.0 -gspread==5.12.0 -lxml==4.9.1 matplotlib==3.3.0 netCDF4==1.6.4 -numpy==1.26.4 -openpyxl>=3.1.0 -pandas -pylint -pytest rasterio rdp==0.8 -requests==2.27.1 -retry==0.9.2 s2sphere==0.2.5 -shapely==1.8.5 tabula-py -urllib3==1.26.8 -xarray==0.19.0 -xlrd yapf -zipp -beautifulsoup4 -ratelimit -xlsxwriter==3.2.0 diff --git a/run_tests.sh b/run_tests.sh index 8a527a545d..0ddb8d035e 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -17,7 +17,7 @@ set -e # Array of top-level folders with Python code. -PYTHON_FOLDERS="util/ scripts/ import-automation/executor" +PYTHON_FOLDERS="util/ scripts/ import-automation/executor tools/" # Flag used signal if Python requirements have already been installed. 
PYTHON_REQUIREMENTS_INSTALLED=false diff --git a/scripts/us_census/pep/us_pep_sex/process.py b/scripts/us_census/pep/us_pep_sex/process.py index b28b918241..7d8771212a 100644 --- a/scripts/us_census/pep/us_pep_sex/process.py +++ b/scripts/us_census/pep/us_pep_sex/process.py @@ -535,7 +535,7 @@ def _state_2010_2020(file_path: str) -> pd.DataFrame: df = df[(df['Year'] != 'April2010Census') & (df['Year'] != 'April2010Estimate') & - (df['Year'] != 'April2020')] + (df['Year'] != 'April2020') & (df['Year'] != '2020')] df['Measurement_Method'] = 'dcAggregate/CensusPEPSurvey_PartialAggregate' return df except Exception as e: @@ -563,17 +563,21 @@ def _state_latest(file_path: str) -> pd.DataFrame: 'July2020Male', 'July2020Female', ] - # Adding year-specific columns dynamically till current year - current_year = dt.now().year + # Adding year-specific columns dynamically till latest year + df = pd.read_excel(file_path, engine='openpyxl') + + fourth_row_list = df.iloc[2].tolist() + max_year = np.nanmax(fourth_row_list) if any( + not pd.isna(i) for i in fourth_row_list) else np.nan + + df = pd.read_excel(file_path, skiprows=5, skipfooter=7, header=None) + current_year = int(max_year) + 1 for year in range(2021, current_year): if current_year < 2030: base_columns.append(f'July{year}Total') base_columns.append(f'July{year}Male') base_columns.append(f'July{year}Female') - # Load the data with no column names initially - df = pd.read_excel(file_path, skiprows=5, skipfooter=7, header=None) - # Assign dynamic column names df.columns = base_columns @@ -1103,35 +1107,17 @@ def add_future_year_urls(): # A set to track downloaded URLs for unique {YEAR} and URLs without {i} downloaded_year_urls = set() - # This method will generate URLs for the years 2024 to 2029 - for future_year in range(2023, 2030): - if dt.now().year > future_year: - YEAR = future_year - # Loop through URLs - for url in urls_to_scan: - if "{i}" in url: # This URL contains the {i} variable, so we loop through i from 01 to 56 - for i in range(1, 57): # Loop i from 01 to 56 - formatted_i = f"{i:02}" # Ensure i is always 2 digits (01, 02, ..., 56) - url_to_check = url.format(YEAR=YEAR, i=formatted_i) - - try: - check_url = requests.head(url_to_check, - allow_redirects=True) - if check_url.status_code == 200: - _FILES_TO_DOWNLOAD.append( - {"download_path": url_to_check}) - - except requests.exceptions.RequestException as e: - logging.fatal( - f"URL is not accessible {url_to_check} due to {e}" - ) - - else: # This URL does not contain {i}, so we only need to process it once per year - url_to_check = url.format(YEAR=YEAR) - - # If the URL has already been processed for this year, skip it - if url_to_check in downloaded_year_urls: - continue # Skip this URL if it's already processed + # Loop through years in reverse order from 2030 to 2023 + for future_year in range(2030, 2022, -1): # From 2030 to 2023 + + YEAR = future_year + # Loop through URLs + for url in urls_to_scan: + if "{i}" in url: # This URL contains the {i} variable, so we loop through i from 01 to 56 + for i in range(1, 57): # Loop i from 01 to 56 + formatted_i = f"{i:02}" # Ensure i is always 2 digits (01, 02, ..., 56) + url_to_check = url.format(YEAR=YEAR, i=formatted_i) + logging.info(f"checking url: {url_to_check}") try: check_url = requests.head(url_to_check, @@ -1139,18 +1125,36 @@ def add_future_year_urls(): if check_url.status_code == 200: _FILES_TO_DOWNLOAD.append( {"download_path": url_to_check}) - downloaded_year_urls.add( - url_to_check) # Mark this URL as processed - - 
else: - logging.fatal( - f"URL returned status code {check_url.status_code}: {url_to_check}" - ) except requests.exceptions.RequestException as e: - logging.fatal( + logging.error( f"URL is not accessible {url_to_check} due to {e}") + else: # This URL does not contain {i}, so we only need to process it once per year + url_to_check = url.format(YEAR=YEAR) + logging.info(f"checking url: {url_to_check}") + # If the URL has already been processed for this year, skip it + if url_to_check in downloaded_year_urls: + continue # Skip this URL if it's already processed + + try: + check_url = requests.head(url_to_check, + allow_redirects=True) + if check_url.status_code == 200: + _FILES_TO_DOWNLOAD.append( + {"download_path": url_to_check}) + downloaded_year_urls.add( + url_to_check) # Mark this URL as processed + + else: + logging.error( + f"URL returned status code {check_url.status_code}: {url_to_check}" + ) + + except requests.exceptions.RequestException as e: + logging.error( + f"URL is not accessible {url_to_check} due to {e}") + def download_files(): """ diff --git a/scripts/us_fed/treasury_constant_maturity_rates/manifest.json b/scripts/us_fed/treasury_constant_maturity_rates/manifest.json index bd7e23601a..4e6bda784c 100644 --- a/scripts/us_fed/treasury_constant_maturity_rates/manifest.json +++ b/scripts/us_fed/treasury_constant_maturity_rates/manifest.json @@ -17,7 +17,9 @@ "node_mcf": "treasury_constant_maturity_rates.mcf" } ], - "cron_schedule": "15 3 * * *" + "cron_schedule": "15 3 * * *", + "resource_limits": {"cpu": "1", "memory": "4G"}, + "user_script_timeout": 1800 } ] -} \ No newline at end of file +} diff --git a/tools/import_differ/README.md b/tools/import_differ/README.md new file mode 100644 index 0000000000..b1532600f3 --- /dev/null +++ b/tools/import_differ/README.md @@ -0,0 +1,32 @@ +# Import Differ + +This utility generates a diff (point and series analysis) of two versions of the same dataset for import analysis. + +**Usage** +``` +python import_differ.py --current_data= --previous_data= +``` + +Parameter description: +- current\_data: Path to the current MCF data (single mcf file or folder/* on local/GCS supported). +- previous\_data: Path to the previous MCF data (single mcf file or folder/* on local/GCS supported). +- output\_location: Path to the output data folder. Default value: results. +- groupby\_columns: Columns to group data for diff analysis in the order var,place,time etc. Default value: “variableMeasured,observationAbout,observationDate,measureMethod,unit”. +- value\_columns: Columns with statvar value for diff analysis. Default value: "value,scalingFactor". + +**Output** + +Summary output generated is of the form below showing counts of differences for each variable. + +| |variableMeasured|added|deleted|modified|same|total| +|---|---|---|---|---|---|---| +|0|dcid:var1|1|0|0|0|1| +|1|dcid:var2|0|2|1|1|4| +|2|dcid:var3|0|0|1|0|1| +|3|dcid:var4|0|2|0|0|2| + +Detailed diff output is written to files for further analysis. Sample result files can be found under folder 'test/results'. 
+- point\_analysis\_summary.csv: diff summary for point analysis
+- point\_analysis\_results.csv: detailed results for point analysis
+- series\_analysis\_summary.csv: diff summary for series analysis
+- series\_analysis\_results.csv: detailed results for series analysis
diff --git a/tools/import_differ/__init__.py b/tools/import_differ/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/import_differ/differ_utils.py b/tools/import_differ/differ_utils.py
new file mode 100644
index 0000000000..40a8bc6a00
--- /dev/null
+++ b/tools/import_differ/differ_utils.py
@@ -0,0 +1,94 @@
+import glob
+import os
+import pandas as pd
+import re
+
+from absl import logging
+from google.cloud.storage import Client
+
+
+def load_mcf_file(file: str) -> pd.DataFrame:
+    """ Reads an MCF text file and returns it as a dataframe."""
+    mcf_file = open(file, 'r', encoding='utf-8')
+    mcf_contents = mcf_file.read()
+    mcf_file.close()
+    # nodes separated by a blank line
+    mcf_nodes_text = mcf_contents.split('\n\n')
+    # lines separated as property: constraint
+    mcf_line = re.compile(r'^(\w+)\s*:\s*(.*)$')
+    mcf_nodes = []
+    for node in mcf_nodes_text:
+        current_mcf_node = {}
+        for line in node.split('\n'):
+            parsed_line = mcf_line.match(line)
+            if parsed_line is not None:
+                current_mcf_node[parsed_line.group(1)] = parsed_line.group(2)
+        if current_mcf_node:
+            if current_mcf_node['typeOf'] == 'dcid:StatVarObservation':
+                mcf_nodes.append(current_mcf_node)
+            else:
+                logging.warning(
+                    f'Ignoring node of type:{current_mcf_node["typeOf"]}')
+    df = pd.DataFrame(mcf_nodes)
+    return df
+
+
+def load_mcf_files(path: str) -> pd.DataFrame:
+    """ Loads all sharded mcf files in the given directory and
+    returns a single combined dataframe."""
+    df_list = []
+    filenames = glob.glob(path + '.mcf')
+    for filename in filenames:
+        df = load_mcf_file(filename)
+        df_list.append(df)
+    result = pd.concat(df_list, ignore_index=True)
+    return result
+
+
+def write_data(df: pd.DataFrame, path: str, file: str):
+    """ Writes a dataframe to a CSV file with the given path."""
+    out_file = open(os.path.join(path, file), mode='w', encoding='utf-8')
+    df.to_csv(out_file, index=False, mode='w')
+    out_file.close()
+
+
+def load_data(path: str, tmp_dir: str) -> pd.DataFrame:
+    """ Loads data from the given path and returns as a dataframe.
+    Args:
+      path: local or gcs path (single file or folder/* format)
+      tmp_dir: destination folder
+    Returns:
+      dataframe with the input data
+    """
+    if path.startswith('gs://'):
+        path = get_gcs_data(path, tmp_dir)
+
+    if path.endswith('*'):
+        return load_mcf_files(path)
+    else:
+        return load_mcf_file(path)
+
+
+def get_gcs_data(uri: str, tmp_dir: str) -> str:
+    """ Downloads files from GCS and copies them to local.
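+    Blobs are copied into tmp_dir with '/' in their names replaced by '_'.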
+ Args: + uri: single file path or folder/* format + tmp_dir: destination folder + Returns: + path to the output file/folder + """ + + client = Client() + bucket = client.get_bucket(uri.split('/')[2]) + if uri.endswith('*'): + blobs = client.list_blobs(bucket) + for blob in blobs: + path = os.path.join(tmp_dir, blob.name.replace('/', '_')) + blob.download_to_filename(path) + return os.path.join(tmp_dir, '*') + else: + file_name = uri.split('/')[3] + blob = bucket.get_blob(file_name) + path = os.path.join(tmp_dir, blob.name.replace('/', '_')) + blob.download_to_filename(path) + return path diff --git a/tools/import_differ/import_differ.py b/tools/import_differ/import_differ.py new file mode 100644 index 0000000000..21e659ba74 --- /dev/null +++ b/tools/import_differ/import_differ.py @@ -0,0 +1,267 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Utility to generate a dataset diff for import analysis.""" + +import os +import pandas as pd +import random + +from absl import app +from absl import flags +from absl import logging + +import differ_utils + +SAMPLE_COUNT = 3 +GROUPBY_COLUMNS = 'variableMeasured,observationAbout,observationDate,measurementMethod,unit,observationPeriod' +VALUE_COLUMNS = 'value,scalingFactor' + +FLAGS = flags.FLAGS +flags.DEFINE_string( + 'current_data', '', 'Path to the current MCF data \ + (single mcf file or folder/* on local/GCS supported).') +flags.DEFINE_string( + 'previous_data', '', 'Path to the previous MCF data \ + (single mcf file or folder/* on local/GCS supported).') +flags.DEFINE_string('output_location', 'results', \ + 'Path to the output data folder.') + +flags.DEFINE_string( + 'groupby_columns', GROUPBY_COLUMNS, + 'Columns to group data for diff analysis in the order (var,place,time etc.).' +) +flags.DEFINE_string('value_columns', VALUE_COLUMNS, + 'Columns with statvar value for diff analysis.') + + +class ImportDiffer: + """ + Utility to generate a diff (point and series analysis) + of two versions of the same dataset for import analysis. + + Usage: + $ python import_differ.py --current_data= --previous_data= + + Summary output generated is of the form below showing + counts of differences for each variable. + + variableMeasured added deleted modified same total + 0 dcid:var1 1 0 0 0 1 + 1 dcid:var2 0 2 1 1 4 + 2 dcid:var3 0 0 1 0 1 + 3 dcid:var4 0 2 0 0 2 + + Detailed diff output is written to files for further analysis. 
+ - point_analysis_summary.csv: diff summry for point analysis + - point_analysis_results.csv: detailed results for point analysis + - series_analysis_summary.csv: diff summry for series analysis + - series_analysis_results.csv: detailed results for series analysis + + """ + + def __init__(self, + current_data, + previous_data, + output_location, + groupby_columns=GROUPBY_COLUMNS, + value_columns=VALUE_COLUMNS): + self.current_data = current_data + self.previous_data = previous_data + self.output_location = output_location + self.groupby_columns = groupby_columns.split(',') + self.value_columns = value_columns.split(',') + self.variable_column = self.groupby_columns[0] + self.place_column = self.groupby_columns[1] + self.time_column = self.groupby_columns[2] + self.diff_column = 'diff_result' + + def _cleanup_data(self, df: pd.DataFrame): + for column in ['added', 'deleted', 'modified', 'same']: + df[column] = df[column] if column in df.columns else 0 + df[column] = df[column].fillna(0).astype(int) + + def _get_samples(self, row): + years = sorted(row[self.time_column]) + if len(years) > SAMPLE_COUNT: + return [years[0]] + random.sample(years[1:-1], + SAMPLE_COUNT - 2) + [years[-1]] + else: + return years + + # Processes two dataset files to identify changes. + def process_data(self, previous_df: pd.DataFrame, + current_df: pd.DataFrame) -> pd.DataFrame: + """ + Process previous and current datasets to generate + the intermediate data for point and series analysis. + Args: + current_df: dataframe with current (new) data + previous_df: dataframe with previous (old) data + Returns: + intermediate merged data for analysis + """ + cur_df_columns = current_df.columns.values.tolist() + self.groupby_columns = [ + i for i in self.groupby_columns if i in cur_df_columns + ] + self.value_columns = [ + i for i in self.value_columns if i in cur_df_columns + ] + df1 = previous_df.loc[:, self.groupby_columns + self.value_columns] + df2 = current_df.loc[:, self.groupby_columns + self.value_columns] + df1['_value_combined'] = df1[self.value_columns]\ + .apply(lambda row: '_'.join(row.values.astype(str)), axis=1) + df2['_value_combined'] = df2[self.value_columns]\ + .apply(lambda row: '_'.join(row.values.astype(str)), axis=1) + df1.drop(columns=self.value_columns, inplace=True) + df2.drop(columns=self.value_columns, inplace=True) + # Perform outer join operation to identify differences. + result = pd.merge(df1, + df2, + on=self.groupby_columns, + how='outer', + indicator=self.diff_column) + result[self.diff_column] = result.apply( + lambda row: 'added' if row[self.diff_column] == 'right_only' \ + else 'deleted' if row[self.diff_column] == 'left_only' \ + else 'modified' if row['_value_combined_x'] != row['_value_combined_y'] \ + else 'same', axis=1) + return result + + def point_analysis(self, + in_data: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame): + """ + Performs point diff analysis to identify data point changes. 
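+        Counts of added/deleted/modified/same data points are reported per
+        variable, along with sampled places and dates for each category.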
+ Args: + in_data: intermediate data generated by processing previous/current data + Returns: + summary and results from the analysis + """ + column_list = [ + self.variable_column, self.place_column, self.time_column, + self.diff_column + ] + result = in_data.loc[:, column_list] + result = result.groupby( + [self.variable_column, self.diff_column], + observed=True, + as_index=False)[[self.place_column, + self.time_column]].agg(lambda x: x.tolist()) + result['size'] = result.apply(lambda row: len(row[self.place_column]), + axis=1) + result[self.place_column] = result.apply(lambda row: random.sample( + row[self.place_column], + min(SAMPLE_COUNT, len(row[self.place_column]))), + axis=1) + result[self.time_column] = result.apply(self._get_samples, axis=1) + summary = result.pivot( + index=self.variable_column, columns=self.diff_column, values='size')\ + .reset_index().rename_axis(None, axis=1) + self._cleanup_data(summary) + summary['total'] = summary.apply(lambda row: row['added'] + row[ + 'deleted'] + row['modified'] + row['same'], + axis=1) + return summary, result + + def series_analysis(self, + in_data: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame): + """ + Performs series diff analysis to identify time series changes. + Args: + in_data: intermediate data generated by processing previous/current data + Returns: + summary and results from the analysis + """ + column_list = [ + self.variable_column, self.place_column, self.diff_column + ] + result = in_data.loc[:, column_list] + result = result.groupby(column_list, as_index=False).size() + result = result.pivot( + index=[self.variable_column, self.place_column], columns=self.diff_column, values='size')\ + .reset_index().rename_axis(None, axis=1) + self._cleanup_data(result) + result[self.diff_column] = result.apply(lambda row: 'added' if row['added'] > 0 \ + and row['deleted'] + row['modified'] + row['same'] == 0 \ + else 'deleted' if row['deleted'] > 0 and row['added'] + row['modified'] + row['same'] == 0 \ + else 'modified' if row['deleted'] > 0 or row['added'] > 0 or row['modified'] > 0 \ + else 'same', axis=1) + result = result[column_list] + result = result.groupby( + [self.variable_column, self.diff_column], + observed=True, + as_index=False)[self.place_column].agg(lambda x: x.tolist()) + result['size'] = result.apply(lambda row: len(row[self.place_column]), + axis=1) + result[self.place_column] = result.apply(lambda row: random.sample( + row[self.place_column], + min(SAMPLE_COUNT, len(row[self.place_column]))), + axis=1) + summary = result.pivot( + index=self.variable_column, columns=self.diff_column, values='size')\ + .reset_index().rename_axis(None, axis=1) + self._cleanup_data(summary) + summary['total'] = summary.apply(lambda row: row['added'] + row[ + 'deleted'] + row['modified'] + row['same'], + axis=1) + return summary, result + + def run_differ(self): + if not os.path.exists(self.output_location): + os.makedirs(self.output_location) + logging.info('Loading data...') + current_df = differ_utils.load_data(self.current_data, + self.output_location) + previous_df = differ_utils.load_data(self.previous_data, + self.output_location) + + logging.info('Processing data...') + in_data = self.process_data(previous_df, current_df) + + logging.info('Point analysis:') + summary, result = self.point_analysis(in_data) + result.sort_values(by=[self.diff_column, self.variable_column], + inplace=True) + print(summary.head(10)) + print(result.head(10)) + differ_utils.write_data(summary, self.output_location, + 'point_analysis_summary.csv') + 
differ_utils.write_data(result, self.output_location, + 'point_analysis_results.csv') + + logging.info('Series analysis:') + summary, result = self.series_analysis(in_data) + result.sort_values(by=[self.diff_column, self.variable_column], + inplace=True) + print(summary.head(10)) + print(result.head(10)) + differ_utils.write_data(summary, self.output_location, + 'series_analysis_summary.csv') + differ_utils.write_data(result, self.output_location, + 'series_analysis_results.csv') + + logging.info('Differ output written to folder: %s', + self.output_location) + + +def main(_): + '''Runs the differ.''' + differ = ImportDiffer(FLAGS.current_data, FLAGS.previous_data, + FLAGS.output_location, FLAGS.groupby_columns, + FLAGS.value_columns) + differ.run_differ() + + +if __name__ == '__main__': + app.run(main) diff --git a/tools/import_differ/import_differ_test.py b/tools/import_differ/import_differ_test.py new file mode 100644 index 0000000000..60137119d8 --- /dev/null +++ b/tools/import_differ/import_differ_test.py @@ -0,0 +1,56 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pandas as pd +import unittest + +from pandas.testing import assert_frame_equal +from import_differ import ImportDiffer + +import differ_utils + +module_dir = os.path.dirname(__file__) + + +class TestImportDiffer(unittest.TestCase): + ''' + Test Class to compare expected output in test/ directory to the + output generated by ImportDiffer class + ''' + + def test_diff_analysis(self): + groupby_columns = 'variableMeasured,observationAbout,observationDate' + value_columns = 'value' + current_data = os.path.join(module_dir, 'test', 'current.mcf') + previous_data = os.path.join(module_dir, 'test', 'previous.mcf') + output_location = os.path.join(module_dir, 'test') + + differ = ImportDiffer(current_data, previous_data, output_location, + groupby_columns, value_columns) + current = differ_utils.load_mcf_file(current_data) + previous = differ_utils.load_mcf_file(previous_data) + + in_data = differ.process_data(previous, current) + summary, result = differ.point_analysis(in_data) + result = pd.read_csv(os.path.join(module_dir, 'test', 'result1.csv')) + assert_frame_equal(summary, result) + + summary, result = differ.series_analysis(in_data) + result = pd.read_csv(os.path.join(module_dir, 'test', 'result2.csv')) + assert_frame_equal(summary, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/import_differ/test/current.mcf b/tools/import_differ/test/current.mcf new file mode 100644 index 0000000000..2e994a7a45 --- /dev/null +++ b/tools/import_differ/test/current.mcf @@ -0,0 +1,35 @@ +Node: cpcb_air_quality/E17/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Max_Concentration_AirPollutant_Ozone +value: 53.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/bhdp3vy7dee0d" + +Node: 
cpcb_air_quality/E18/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_Ozone +value: 28.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/8e11gqvkt183b" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 42.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-25T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 40.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" diff --git a/tools/import_differ/test/previous.mcf b/tools/import_differ/test/previous.mcf new file mode 100644 index 0000000000..ce9fcb31d1 --- /dev/null +++ b/tools/import_differ/test/previous.mcf @@ -0,0 +1,62 @@ +Node: cpcb_air_quality/E18/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_Ozone +value: 29.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/8e11gqvkt183b" + +Node: cpcb_air_quality/E16/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Min_Concentration_AirPollutant_Ozone +value: 18.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/z8j7g5sw11klh" + +Node: cpcb_air_quality/E16/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Min_Concentration_AirPollutant_Ozone +value: 18.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/z8j7g5sw11klh" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 41.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-25T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___APPCB +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 40.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-24T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 41.0 +typeOf: dcid:StatVarObservation +dcid: "dc/o/h1sjhdxycwwmc" + +Node: cpcb_air_quality/E15/944d9e6d-ec38-4e61-175a-9bbabfd35f97 +observationDate: "2024-09-25T12:00:00" +unit: dcid:MicrogramsPerCubicMeter +observationAbout: dcid:cpcpAq/Secretariat_Amaravati___IMD +variableMeasured: dcid:Mean_Concentration_AirPollutant_CO +value: 40.0 +typeOf: dcid:StatVarObservation +dcid: 
"dc/o/h1sjhdxycwwmc" diff --git a/tools/import_differ/test/result1.csv b/tools/import_differ/test/result1.csv new file mode 100644 index 0000000000..4d344b5639 --- /dev/null +++ b/tools/import_differ/test/result1.csv @@ -0,0 +1,5 @@ +variableMeasured,added,deleted,modified,same,total +dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1 +dcid:Mean_Concentration_AirPollutant_CO,0,2,1,1,4 +dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1 +dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2 diff --git a/tools/import_differ/test/result2.csv b/tools/import_differ/test/result2.csv new file mode 100644 index 0000000000..4f3b954643 --- /dev/null +++ b/tools/import_differ/test/result2.csv @@ -0,0 +1,5 @@ +variableMeasured,added,deleted,modified,same,total +dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1 +dcid:Mean_Concentration_AirPollutant_CO,0,1,1,0,2 +dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1 +dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2 diff --git a/tools/import_differ/test/results/point_analysis_results.csv b/tools/import_differ/test/results/point_analysis_results.csv new file mode 100644 index 0000000000..80feb425a5 --- /dev/null +++ b/tools/import_differ/test/results/point_analysis_results.csv @@ -0,0 +1,7 @@ +variableMeasured,_diff_result,observationAbout,observationDate,size +dcid:Max_Concentration_AirPollutant_Ozone,added,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],"['""2024-09-24T12:00:00""']",1 +dcid:Mean_Concentration_AirPollutant_CO,deleted,"['dcid:cpcpAq/Secretariat_Amaravati___APPCB', 'dcid:cpcpAq/Secretariat_Amaravati___APPCB']","['""2024-09-24T12:00:00""', '""2024-09-25T12:00:00""']",2 +dcid:Min_Concentration_AirPollutant_Ozone,deleted,"['dcid:cpcpAq/Secretariat_Amaravati___APPCB', 'dcid:cpcpAq/Secretariat_Amaravati___IMD']","['""2024-09-24T12:00:00""', '""2024-09-24T12:00:00""']",2 +dcid:Mean_Concentration_AirPollutant_CO,modified,['dcid:cpcpAq/Secretariat_Amaravati___IMD'],"['""2024-09-24T12:00:00""']",1 +dcid:Mean_Concentration_AirPollutant_Ozone,modified,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],"['""2024-09-24T12:00:00""']",1 +dcid:Mean_Concentration_AirPollutant_CO,same,['dcid:cpcpAq/Secretariat_Amaravati___IMD'],"['""2024-09-25T12:00:00""']",1 diff --git a/tools/import_differ/test/results/point_analysis_summary.csv b/tools/import_differ/test/results/point_analysis_summary.csv new file mode 100644 index 0000000000..4d344b5639 --- /dev/null +++ b/tools/import_differ/test/results/point_analysis_summary.csv @@ -0,0 +1,5 @@ +variableMeasured,added,deleted,modified,same,total +dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1 +dcid:Mean_Concentration_AirPollutant_CO,0,2,1,1,4 +dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1 +dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2 diff --git a/tools/import_differ/test/results/series_analysis_results.csv b/tools/import_differ/test/results/series_analysis_results.csv new file mode 100644 index 0000000000..b776dbd2f5 --- /dev/null +++ b/tools/import_differ/test/results/series_analysis_results.csv @@ -0,0 +1,6 @@ +variableMeasured,_diff_result,observationAbout,size +dcid:Max_Concentration_AirPollutant_Ozone,added,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],1 +dcid:Mean_Concentration_AirPollutant_CO,deleted,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],1 +dcid:Min_Concentration_AirPollutant_Ozone,deleted,"['dcid:cpcpAq/Secretariat_Amaravati___IMD', 'dcid:cpcpAq/Secretariat_Amaravati___APPCB']",2 +dcid:Mean_Concentration_AirPollutant_CO,modified,['dcid:cpcpAq/Secretariat_Amaravati___IMD'],1 
+dcid:Mean_Concentration_AirPollutant_Ozone,modified,['dcid:cpcpAq/Secretariat_Amaravati___APPCB'],1
diff --git a/tools/import_differ/test/results/series_analysis_summary.csv b/tools/import_differ/test/results/series_analysis_summary.csv
new file mode 100644
index 0000000000..4f3b954643
--- /dev/null
+++ b/tools/import_differ/test/results/series_analysis_summary.csv
@@ -0,0 +1,5 @@
+variableMeasured,added,deleted,modified,same,total
+dcid:Max_Concentration_AirPollutant_Ozone,1,0,0,0,1
+dcid:Mean_Concentration_AirPollutant_CO,0,1,1,0,2
+dcid:Mean_Concentration_AirPollutant_Ozone,0,0,1,0,1
+dcid:Min_Concentration_AirPollutant_Ozone,0,2,0,0,2
diff --git a/tools/import_validation/import_validation.py b/tools/import_validation/import_validation.py
new file mode 100644
index 0000000000..8dee351aeb
--- /dev/null
+++ b/tools/import_validation/import_validation.py
@@ -0,0 +1,140 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Class to perform validations for import automation."""
+
+from absl import app
+from absl import flags
+from absl import logging
+from enum import Enum
+import pandas as pd
+import os
+import json
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string('config_file', 'validation_config.json',
+                    'Path to the validation config file.')
+flags.DEFINE_string('differ_output_location', '.',
+                    'Path to the differ output data folder.')
+flags.DEFINE_string('stats_summary_location', '.',
+                    'Path to the stats summary report folder.')
+flags.DEFINE_string('validation_output_location', '.',
+                    'Path to the validation output folder.')
+
+POINT_ANALYSIS_FILE = 'point_analysis_summary.csv'
+STATS_SUMMARY_FILE = 'summary_report.csv'
+VALIDATION_OUTPUT_FILE = 'validation_output.csv'
+
+Validation = Enum('Validation', [
+    ('MODIFIED_COUNT', 1),
+    ('UNMODIFIED_COUNT', 2),
+    ('ADDED_COUNT', 3),
+    ('DELETED_COUNT', 4),
+    ('LATEST_DATA', 5),
+])
+
+
+class ValidationResult:
+    """Describes the result of the validation of an import."""
+
+    def __init__(self, status, name, message):
+        # Status of the execution: PASSED or FAILED.
+        self.status = status
+        # Name of the validation executed.
+        self.name = name
+        # Description of the result/error message.
+        self.message = message
+
+
+class ImportValidation:
+    """
+    Class to perform validations for import automation.
+
+    Usage:
+    $ python import_validation.py --config_file= \
+        --differ_output_location= --stats_summary_location=
+
+    Each import can provide a configuration (JSON) to select which validation
+    checks are performed. Validation results are written to an output file.
+    Sample config and output files can be found in the test folder.
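+
+    For illustration, the config used by the unit test
+    (test/test_config.json) enables these checks:
+
+    [
+      { "validation": "DELETED_COUNT", "threshold": 1 },
+      { "validation": "MODIFIED_COUNT" },
+      { "validation": "ADDED_COUNT" },
+      { "validation": "UNMODIFIED_COUNT" }
+    ]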
+ """ + + def __init__(self, config_file: str, differ_output: str, stats_summary: str, + validation_output: str): + logging.info('Reading config from %s', config_file) + self.differ_results = pd.read_csv(differ_output) + self.validation_map = { + Validation.MODIFIED_COUNT: self._modified_count_validation, + Validation.ADDED_COUNT: self._added_count_validation, + Validation.DELETED_COUNT: self._deleted_count_validation, + Validation.UNMODIFIED_COUNT: self._unmodified_count_validation + } + self.validation_output = validation_output + self.validation_result = [] + with open(config_file, encoding='utf-8') as fd: + self.validation_config = json.load(fd) + + def _latest_data_validation(self, config: dict): + logging.info('Not yet implemented') + + # Checks if the number of deleted data points are below a threshold. + def _deleted_count_validation(self, config: dict): + if self.differ_results['deleted'].sum() > config['threshold']: + raise AssertionError(f'Validation failed: {config["validation"]}') + + # Checks if number of modified points for each stat var are same. + def _modified_count_validation(self, config: dict): + if self.differ_results['modified'].nunique() > 1: + raise AssertionError(f'Validation failed: {config["validation"]}') + + # Checks if number of added points for each stat var are same. + def _added_count_validation(self, config: dict): + if self.differ_results['added'].nunique() > 1: + raise AssertionError(f'Validation failed: {config["validation"]}') + + # Checks if number of unmodified points for each stat var are same. + def _unmodified_count_validation(self, config: dict): + if self.differ_results['same'].nunique() > 1: + raise AssertionError(f'Validation failed: {config["validation"]}') + + def _run_validation(self, config) -> ValidationResult: + try: + self.validation_map[Validation[config['validation']]](config) + logging.info('Validation passed: %s', config['validation']) + return ValidationResult('PASSED', config['validation'], '') + except AssertionError as exc: + logging.error(repr(exc)) + return ValidationResult('FAILED', config['validation'], repr(exc)) + + def run_validations(self): + output_file = open(self.validation_output, mode='w', encoding='utf-8') + output_file.write('test,status,message\n') + for config in self.validation_config: + result = self._run_validation(config) + output_file.write( + f'{result.name},{result.status},{result.message}\n') + self.validation_result.append(result) + output_file.close() + + +def main(_): + validation = ImportValidation( + FLAGS.config_file, + os.path.join(FLAGS.differ_output_location, POINT_ANALAYSIS_FILE), + os.path.join(FLAGS.stats_summary_location, STATS_SUMMARY_FILE), + os.paht.join(FLAGS.validation_output_location, VALIDATION_OUTPUT_FILE)) + validation.run_validations() + + +if __name__ == '__main__': + app.run(main) diff --git a/tools/import_validation/import_validation_test.py b/tools/import_validation/import_validation_test.py new file mode 100644 index 0000000000..a33ca72d66 --- /dev/null +++ b/tools/import_validation/import_validation_test.py @@ -0,0 +1,45 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import pandas as pd
+import unittest
+
+from pandas.testing import assert_frame_equal
+from import_validation import ImportValidation
+
+module_dir = os.path.dirname(__file__)
+
+
+class TestValidation(unittest.TestCase):
+    '''
+    Test class to compare the expected output in the test/ directory with the
+    output generated by the ImportValidation class.
+    '''
+
+    def test_validation(self):
+        result_file = os.path.join(module_dir, 'test', 'test_output.csv')
+        config_file = os.path.join(module_dir, 'test', 'test_config.json')
+        differ_output = os.path.join(module_dir, 'test', 'differ_output.csv')
+        output_file = os.path.join(module_dir, 'validation_output.csv')
+
+        validation = ImportValidation(config_file, differ_output, '',
+                                      output_file)
+        validation.run_validations()
+
+        expected = pd.read_csv(result_file)
+        actual = pd.read_csv(output_file)
+        assert_frame_equal(actual, expected)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tools/import_validation/test/differ_output.csv b/tools/import_validation/test/differ_output.csv
new file mode 100644
index 0000000000..280df7aedb
--- /dev/null
+++ b/tools/import_validation/test/differ_output.csv
@@ -0,0 +1,5 @@
+variableMeasured,added,deleted,modified,same,total
+dcid:var1,1,0,0,3,4
+dcid:var2,1,0,0,2,4
+dcid:var3,1,0,0,3,4
+dcid:var4,1,0,0,3,4
diff --git a/tools/import_validation/test/test_config.json b/tools/import_validation/test/test_config.json
new file mode 100644
index 0000000000..21daba858b
--- /dev/null
+++ b/tools/import_validation/test/test_config.json
@@ -0,0 +1,15 @@
+[
+    {
+        "validation": "DELETED_COUNT",
+        "threshold": 1
+    },
+    {
+        "validation": "MODIFIED_COUNT"
+    },
+    {
+        "validation": "ADDED_COUNT"
+    },
+    {
+        "validation": "UNMODIFIED_COUNT"
+    }
+]
diff --git a/tools/import_validation/test/test_output.csv b/tools/import_validation/test/test_output.csv
new file mode 100644
index 0000000000..a8327651b2
--- /dev/null
+++ b/tools/import_validation/test/test_output.csv
@@ -0,0 +1,5 @@
+test,status,message
+DELETED_COUNT,PASSED,
+MODIFIED_COUNT,PASSED,
+ADDED_COUNT,PASSED,
+UNMODIFIED_COUNT,FAILED,AssertionError('Validation failed: UNMODIFIED_COUNT')
\ No newline at end of file
diff --git a/tools/import_validation/validation_config.json b/tools/import_validation/validation_config.json
new file mode 100644
index 0000000000..21daba858b
--- /dev/null
+++ b/tools/import_validation/validation_config.json
@@ -0,0 +1,15 @@
+[
+    {
+        "validation": "DELETED_COUNT",
+        "threshold": 1
+    },
+    {
+        "validation": "MODIFIED_COUNT"
+    },
+    {
+        "validation": "ADDED_COUNT"
+    },
+    {
+        "validation": "UNMODIFIED_COUNT"
+    }
+]
diff --git a/tools/statvar_importer/config_flags.py b/tools/statvar_importer/config_flags.py
new file mode 100644
index 0000000000..39c37ddca6
--- /dev/null
+++ b/tools/statvar_importer/config_flags.py
@@ -0,0 +1,544 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Class to store configuration parameters as a dictionary.""" + +import ast +from collections import OrderedDict +import collections.abc +import os +import sys +from typing import Union + +from absl import app +from absl import flags +from absl import logging + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + +import file_util +from config_map import ConfigMap +from mcf_file_util import get_numeric_value + +_FLAGS = flags.FLAGS + +flags.DEFINE_string('config_file', '', 'File with configuration parameters.') +flags.DEFINE_list('data_url', '', 'URLs to download the data from.') +flags.DEFINE_string('shard_input_by_column', '', + 'Shard input data by unique values in column.') +flags.DEFINE_integer( + 'shard_prefix_length', + sys.maxsize, + 'Shard input data by value prefix of given length.', +) +flags.DEFINE_list( + 'pv_map', [], + 'Comma separated list of namespace:file with property values.') +flags.DEFINE_list('input_data', [], + 'Comma separated list of data files to be processed.') +flags.DEFINE_string('input_encoding', 'utf-8', 'Encoding for input_data files.') +flags.DEFINE_list( + 'input_xls_sheets', + [], + 'Comma separated list of sheet names within input_data xls files to be processed.', +) +flags.DEFINE_integer('input_rows', sys.maxsize, + 'Number of rows per input file to process.') +flags.DEFINE_integer('input_columns', sys.maxsize, + 'Number of columns in input file to process.') +flags.DEFINE_integer( + 'skip_rows', 0, 'Number of rows to skip at the begining of the input file.') +flags.DEFINE_integer( + 'header_rows', + -1, + 'Number of header rows with property-value mappings for columns. If -1,' + ' will lookup PVs for all rows.', +) +flags.DEFINE_integer( + 'header_columns', + -1, + 'Number of header columns with property-value mappings for rows. 
If -1,' + ' will lookup PVs for all columns.', +) +flags.DEFINE_string( + 'aggregate_duplicate_svobs', + None, + 'Aggregate SVObs with same place, date by one of the following: sum, min or' + ' max.', +) +flags.DEFINE_bool('schemaless', False, 'Allow schemaless StatVars.') +flags.DEFINE_string('output_path', '', + 'File prefix for output mcf, csv and tmcf.') +flags.DEFINE_string( + 'existing_statvar_mcf', + '', + 'StatVar MCF files for any existing stat var nodes to be resused.', +) +flags.DEFINE_string( + 'existing_schema_mcf', + '', + 'StatVar MCF files for any existing schema nodes to be resused.', +) +flags.DEFINE_integer('parallelism', 0, 'Number of parallel processes to use.') +flags.DEFINE_integer('pprof_port', 0, 'HTTP port for pprof server.') +flags.DEFINE_bool('debug', False, 'Enable debug messages.') +flags.DEFINE_integer('log_level', logging.INFO, + 'Log level messages to be shown.') + +# Flags for place name resolution +flags.DEFINE_string('dc_api_key', '', + 'DataCommons v2 API key used for APIs such as v2/resolve') +flags.DEFINE_string('maps_api_key', '', + 'Maps API key for place lookup by name.') +flags.DEFINE_list('places_csv', [], + 'CSV file with place names and dcids to match.') +flags.DEFINE_string( + 'places_resolved_csv', + '', + 'CSV file with resolved place names and dcids to match.', +) +flags.DEFINE_list('place_type', [], 'List of places types for name reoslution.') +flags.DEFINE_list('places_within', [], + 'List of places types for name reoslution.') +flags.DEFINE_string( + 'statvar_dcid_remap_csv', + '', + 'CSV file with existing DCIDs for generated statvars.', +) +flags.DEFINE_string('output_counters', '', 'CSV file with counters.') + +flags.DEFINE_bool( + 'resume', + False, + 'Resume processing to create output files not yet generated.', +) + +# Flags for spell checks +_DEFAULT_SPELL_ALLOWLIST = os.path.join(_SCRIPT_DIR, 'words_allowlist.txt') +flags.DEFINE_bool('spell_check', True, 'Run schema spell checker') +flags.DEFINE_string('sanity_check_output', '', 'File with list of spell errors') +flags.DEFINE_string('spell_check_allow_list', _DEFAULT_SPELL_ALLOWLIST, + 'File with words to be allowed') +flags.DEFINE_string('spell_check_config', '', 'File with words to be allowed') +flags.DEFINE_bool('spell_check_text', False, + 'if True, spell check quoted text values only.') +flags.DEFINE_list('spell_check_ignore_props', None, + 'List of properties to ignore for spell check.') + +# Flags for pvmap generation +flags.DEFINE_bool('generate_pvmap', True, 'Generate PVmap') +flags.DEFINE_string('google_genai_key', '', 'Google API key for GenAI prompt.') +flags.DEFINE_string('sample_pvmap', os.path.join(_SCRIPT_DIR, + 'sample_pvmap.csv'), + 'Sample PVmap for gen AI.') +flags.DEFINE_string('sample_statvars', + os.path.join(_SCRIPT_DIR, 'sample_statvars.mcf'), + 'Sample statvars MCF for GenAI prompt.') +flags.DEFINE_string('data_context', '', + 'Text file with metadata descriptions for data.') +flags.DEFINE_bool('generate_statvar_name', False, + 'Generate names for Statvars.') +flags.DEFINE_bool('llm_generate_statvar_name', False, + 'Generate names for Statvars.') + + +def get_default_config() -> dict: + """Returns the default config as dictionary of config parameters and values.""" + return { + # 'config parameter in snake_case': value + 'ignore_numeric_commas': + True, # Numbers may have commas + 'input_reference_column': + '#input', + 'input_min_columns_per_row': + 2, + 'input_data': + _FLAGS.input_data, + 'data_url': + _FLAGS.data_url, + 'input_encoding': + 
_FLAGS.input_encoding, + 'input_xls': + _FLAGS.input_xls, + 'pv_map_drop_undefined_nodes': + (False), # Don't drop undefined PVs in the column PV Map. + 'duplicate_svobs_key': + '#ErrorDuplicateSVObs', + 'duplicate_statvars_key': + '#ErrorDuplicateStatVar', + 'drop_statvars_without_svobs': + 1, + # Aggregate values for duplicate SVObs with the same statvar, place, date + # and units with one of the following functions: + # sum: Add all values. + # min: Set the minimum value. + # max: Set the maximum value. + # Internal property in PV map to aggregate values for a specific statvar. + 'aggregate_key': + '#Aggregate', + # Aggregation type duplicate SVObs for all statvars. + 'aggregate_duplicate_svobs': + _FLAGS.aggregate_duplicate_svobs, + 'merged_pvs_property': + '#MergedSVObs', + 'multi_value_properties': [ + 'name', 'alternateName', 'measurementDenominator' + ], + # Enable schemaless StatVars, + # If True, allow statvars with capitalized property names. + # Those properties are commented out when generating MCF but used for + # statvar dcid. + 'schemaless': + _FLAGS.schemaless, + # Whether to lookup DC API and drop undefined PVs in statvars. + 'schemaless_statvar_comment_undefined_pvs': + False, + 'default_statvar_pvs': + OrderedDict({ + 'typeOf': 'dcs:StatisticalVariable', + 'measurementQualifier': '', + 'statType': 'dcs:measuredValue', + 'measuredProperty': 'dcs:count', + 'populationType': '', + 'memberOf': '', + 'name': '', + 'nameWithLanguage': '', + 'alternateName': '', + 'description': '', + 'descriptionUrl': '', + }), + 'statvar_dcid_ignore_properties': [ + 'description', 'name', 'nameWithLanguage', 'descriptionUrl', + 'alternateName' + ], + 'statvar_dcid_ignore_values': ['measuredValue', 'StatisticalVariable'], + 'default_svobs_pvs': + OrderedDict({ + 'typeOf': 'dcs:StatVarObservation', + 'observationDate': '', + 'observationAbout': '', + 'value': '', + 'observationPeriod': '', + 'measurementMethod': '', + 'unit': '', + 'scalingFactor': '', + 'variableMeasured': '', + 'measurementResult': '', + '#Aggregate': '', + }), + 'required_statvar_properties': [ + 'measuredProperty', + 'populationType', + ], + 'required_statvarobs_properties': [ + 'variableMeasured', + 'observationAbout', + 'observationDate', + 'value', + ], + # Settings to compare StatVars with existing statvars to reuse dcids. + 'existing_statvar_mcf': + _FLAGS.existing_statvar_mcf, + 'existing_schema_mcf': + _FLAGS.existing_schema_mcf, + 'statvar_fingerprint_ignore_props': [ + 'Node', + 'dcid', + 'name', + 'nameWithLanguage', + 'alternateName', + 'description', + 'descriptionUrl', + 'provenance', + 'memberOf', + 'member', + 'relevantVariable', + ], + 'statvar_fingerprint_include_props': [], + # File with generated DCIDs remapped to existing dcids. + # This is used for schemaless statvars that can't be matched with + # existing statvars using property:value + 'statvar_dcid_remap_csv': + _FLAGS.statvar_dcid_remap_csv, + # Use numeric data in any column as a value. + # It may still be dropped if no SVObs can be constructed out of it. + # If False, SVObs is only emitted for PVs that have a map for 'value', + # for example, 'MyColumn': { 'value': '@Data' } + 'use_all_numeric_data_values': + False, + # Number format in input. + 'number_decimal': + '.', # decimal character + 'number_separator': + ', ', # separators stripped. + # Word separator, used to split words into phrases for PV map lookups. + 'word_delimiter': + ' ', + # Enable merged cells that inherit PVs from previous column. 
+ 'merged_cells': + True, + # List of default PVS maps to lookup column values if there is no map for a + # column name. + 'default_pv_maps': ['GLOBAL'], + # Row and column indices with content to be looked up in pv_maps. + 'mapped_rows': + 0, + 'mapped_columns': [], + 'show_counters_every_n': + 0, + 'show_counters_every_sec': + 30, + # Settings for place name resolution + 'dc_api_key': + _FLAGS.dc_api_key, + 'maps_api_key': + _FLAGS.maps_api_key, + 'resolve_places': + False, + 'places_csv': + _FLAGS.places_csv, + 'places_resolved_csv': + _FLAGS.places_resolved_csv, + 'place_type': + _FLAGS.place_type, + 'places_within': + _FLAGS.places_within, + + # Filter settings + 'filter_data_min_value': + None, + 'filter_data_max_value': + None, + 'filter_data_max_change_ratio': + None, + 'filter_data_max_yearly_change_ratio': + None, + + # Output options + 'output_path': + _FLAGS.output_path, + 'generate_statvar_mcf': + True, # Generate MCF file with all statvars + 'generate_csv': + True, # Generate CSV with SVObs + 'output_csv_mode': + 'w', # Overwrite output CSV file. + 'output_columns': [], # Emit all SVObs PVs into output csv + 'generate_tmcf': + True, # Generate tMCF for CSV columns + 'skip_constant_csv_columns': + (True), # Skip emitting columns with constant values in the csv + 'output_only_new_statvars': + True, # Drop existing statvars from output + 'output_precision_digits': + 5, # Round floating values to 5 decimal digits. + 'generate_schema_mcf': + True, + 'generate_provisional_schema': + True, + # Settings for DC API. + 'dc_api_root': + 'http://autopush.api.datacommons.org', + 'dc_api_use_cache': + False, + 'dc_api_batch_size': + 100, + # Settings from flags + 'pv_map': + _FLAGS.pv_map, + 'input_rows': + _FLAGS.input_rows, + 'input_columns': + _FLAGS.input_columns, + 'skip_rows': + _FLAGS.skip_rows, + 'ignore_rows': [0], + 'header_rows': + _FLAGS.header_rows, + 'header_columns': + _FLAGS.header_columns, + 'process_rows': [0], + 'parallelism': + _FLAGS.parallelism, + 'output_counters': + _FLAGS.output_counters, + + # Settings for spell checks + 'spell_check': + _FLAGS.spell_check, + 'spell_allowlist': + _FLAGS.spell_check_allow_list, + 'spell_allow_words': [], + 'output_sanity_check': + _FLAGS.sanity_check_output, + 'spell_check_text_only': + _FLAGS.spell_check_text, + 'spell_check_ignore_props': + _FLAGS.spell_check_ignore_props, + 'debug': + _FLAGS.debug, + 'log_level': + _FLAGS.log_level, + + # Settings for PV Map generator + 'generate_pvmap': + _FLAGS.generate_pvmap, + 'google_api_key': + _FLAGS.google_genai_key, + 'sample_pvmap': + _FLAGS.sample_pvmap, + 'sample_statvars': + _FLAGS.sample_statvars, + 'data_context': + _FLAGS.data_context, + 'llm_data_annotation': + _FLAGS.generate_pvmap, + + # Settings for statvar name generator + 'generate_statvar_name': + _FLAGS.generate_statvar_name, # Generate names for StatVars + 'llm_generate_statvar_name': + _FLAGS.llm_generate_statvar_name, + } + + +def init_config_from_flags(filename: str = None) -> ConfigMap: + """Returns a Config object with parameters loaded from a file. + + Args: + filename: name of the file to load. + + Returns: + Config object with all the parameters loaded into the config_dict. + """ + config_dict = dict(get_default_config()) + if isinstance(filename, dict): + config_dict.update(filename) + filename = None + elif isinstance(filename, ConfigMap): + config_dict.update(filename.get_configs()) + elif isinstance(filename, str): + file_config = {} + # Check if filename is a file. 
+ config_files = file_util.file_get_matching(filename) + if config_files: + # Load config from file. + file_config = file_util.file_load_py_dict(config_files) + elif ':' in filename: + # Try parsing config as a string + file_config = _parse_dict(filename) + if file_config: + update_config(file_config, config_dict) + else: + logging.error(f'Unknown config {filename}, ignored') + _set_verbosity(config_dict) + config = ConfigMap(config_dict=config_dict) + return config + + +def _set_verbosity(config: dict): + """Set logging verbosity by the config.""" + if config.get('debug'): + logging.set_verbosity(1) + if config.get('log_level'): + logging.set_verbosity(config.get('log_level')) + logging.info(f'Logging verbosity {logging.get_verbosity()}') + + +def set_config_value(param: str, value: str, config: dict): + """Set the config value for the param with the original type.""" + if param is None: + return + if isinstance(config, ConfigMap): + config = config.get_configs() + orig_value = config.get(param) + if orig_value is not None: + value = get_value_type(value, orig_value) + config[param] = value + + +def update_config(new_config: dict, config: dict) -> dict: + """Add values from the new_config into config and return the updated dict.""" + for key, value in new_config.items(): + set_config_value(key, value, config) + return config + + +def get_value_type(value: str, default_value): + """Returns value in the type of value_type.""" + if value is None: + return value + value_type = type(default_value) + if value_type is list: + # Convert value to list + if isinstance(value, list): + return value + default_element = '' + if len(default_value) > 0: + default_element = default_value[0] + if isinstance(value, str): + value = value.strip() + if value: + if value[0] == '[': + value = value[1:] + if value[-1] == ']': + value = value[:-1] + return [ + get_value_type(v.strip(), default_element) + for v in str(value).split(',') + ] + if value_type is str: + return str(value).strip() + if value_type is int or value_type is float: + return get_numeric_value(value) + if value_type is bool: + if isinstance(value, bool): + return value + if isinstance(value, str): + match value.lower(): + case 'true': + return True + case 'false': + return False + case '': + return False + return get_numeric_value(value) > 0 + if value_type is dict or value_type is OrderedDict: + if isinstance(value, str): + logging.info(f'Converting {value} to dict') + value = value.strip() + if value and value[0] == '{': + value = _parse_dict(value) + elif '=' in value: + # Dict is a list of key=value, pairs. + pv = {} + for prop_value in value.split(','): + prop, val = prop_value.split('=', 1) + prop = prop.strip() + pv[prop] = val.strip() + value = pv + return value + + +def _parse_dict(dict_str: str) -> dict: + """Returns a dict parsed from text string.""" + try: + return ast.literal_eval(dict_str) + except (NameError, ValueError) as e: + logging.error(f'Unable to parse dict {dict_str}') + return {} diff --git a/tools/statvar_importer/eval_functions.py b/tools/statvar_importer/eval_functions.py new file mode 100644 index 0000000000..7e3336442e --- /dev/null +++ b/tools/statvar_importer/eval_functions.py @@ -0,0 +1,125 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for eval statements with PropertyValueMapper. + +The functions can be invoked within '#Eval' in the pv_map.py. +For Example, for format values in 'DateTime' column into ISO-8601 format: +'DateTime': { + '#Eval': 'observationDate=format_date("{Data}")', + } +""" + +import datetime +from datetime import datetime +import os +import re +import sys + +from absl import logging +import dateutil +from dateutil import parser +from dateutil.relativedelta import relativedelta + +# String utility functions + + +def format_date(date_str: str, format_str: str = '%Y-%m-%d') -> str: + """Parse the date string and return formated date string. + + Args: + date_str: Input date string to be parsed. + format_str: output format for date + + Returns: + date formatted by the format_str. + In case of parse error, returns the original date_str. + Raises + NameError in case of any exceptions in parsing. + This will cause any Eval using it to fail. + """ + try: + return dateutil.parser.parse(date_str).strftime(format_str) + except dateutil.parser._parser.ParserError: + return '' + + +def str_to_camel_case(input_string: str, + strip_re: str = r'[^A-Za-z_0-9]') -> str: + """Returns the string in CamelCase without spaces and special characters. + + Example: "Abc-def(HG-123)" -> "AbcDefHG". + + Args: + input_string: string to be converted to CamelCase + strip_chars: regular expression of characters to be removed. + + Returns: + string with non-alpha characters removed and remaining words capitalized. + """ + if not str: + return '' + if not isinstance(input_string, str): + input_string = str(input_string) + # Replace any non-alpha characters with space + clean_str = re.sub(strip_re, ' ', input_string) + clean_str = clean_str.strip() + # Split by space and capitalize first letter, preserving any other capitals + return ''.join( + [w[0].upper() + w[1:] for w in clean_str.split(' ') if len(w) > 0]) + + +EVAL_GLOBALS = { + # Date time functions + 'dateutil_parser_parse': dateutil.parser.parse, + 'format_date': format_date, + 'datetime': datetime, + 'datetime_strptime': datetime.strptime, + 'relativedelta': relativedelta, + # String functions + 'str_to_camel_case': str_to_camel_case, + # Regex functions + 're': re, + 're_sub': re.sub, +} + + +def evaluate_statement(eval_str: str, + variables: dict = {}, + functions: dict = EVAL_GLOBALS) -> (str, str): + """Returns the tuple: (variable, result) after evaluating statement in eval. + + Args: + eval_str: string with statement to be evaluated of the form: + 'variable=statement' if the variable is not specified, an empty string is + retured as variable. + variables: dictionary of variables and values to be used in statement. + functions: dictionary of global functoins that can be invoked within + statement. + + Returns: + tuple of the (variable , result) after evaluating the statement. 
+ in case of exception during eval, None is returned as result + """ + variable = '' + statement = eval_str + if '=' in eval_str: + variable, statement = eval_str.split('=', 1) + variable = variable.strip() + try: + result = eval(statement, functions, variables) + except (SyntaxError, NameError, ValueError, TypeError) as e: + logging.debug( + f'Failed to evaluate: {variable}={statement}, {e} in {variables}') + result = None + return (variable, result) diff --git a/tools/statvar_importer/eval_functions_test.py b/tools/statvar_importer/eval_functions_test.py new file mode 100644 index 0000000000..2e4c442c4c --- /dev/null +++ b/tools/statvar_importer/eval_functions_test.py @@ -0,0 +1,90 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for eval_functions.py.""" + +import os +import sys +import tempfile +import unittest + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) + +import eval_functions + +# module_dir_ is the path to where this test is running from. +_module_dir_ = os.path.dirname(__file__) + + +class TestEvalFunctions(unittest.TestCase): + + def test_evaluate_statement(self): + self.assertEqual( + ('num', 3), + eval_functions.evaluate_statement('num=1+Number', {'Number': 2}), + ) + self.assertEqual( + ('', 4), eval_functions.evaluate_statement('2*Number', + {'Number': 2})) + # Verify None is returned on error in statement + self.assertEqual( + ('name', None), + eval_functions.evaluate_statement( + 'name=1+Data', + {'Data': '2'} # string should raise TypeError + ), + ) + # Missing variable value for Data raises NameError + self.assertEqual(('name', None), + eval_functions.evaluate_statement('name=1+Data')) + + def test_format_date(self): + self.assertEqual('2023-01-31', + eval_functions.format_date('Jan 31, 2023')) + self.assertEqual( + ('month', '2022-01'), + eval_functions.evaluate_statement( + 'month=format_date(Data, "%Y-%m")', {'Data': '2022, Jan 1st'}), + ) + self.assertEqual( + ('', '2022-12-31'), + eval_functions.evaluate_statement( + 'format_date(Data)', {'Data': 'Dec 31st, 2022, 10:00am'}), + ) + self.assertEqual( + ('', ''), + eval_functions.evaluate_statement('format_date("SunMonTue")'), + ) + + def test_str_to_camel_case(self): + self.assertEqual('CamelCase123', + eval_functions.str_to_camel_case(' camel-case 123 ')) + self.assertEqual( + ('name', '10MyDCID'), + eval_functions.evaluate_statement('name=str_to_camel_case(Data)', + {'Data': '1.0 my DCID'}), + ) + self.assertEqual( + ('', 'SnakeCaseString'), + eval_functions.evaluate_statement('str_to_camel_case(Data)', + {'Data': 'snake(case.) 
string'}), + ) + self.assertEqual( + ('', 'String_Value1'), + eval_functions.evaluate_statement( + 'str_to_camel_case(Data, r"[^A-Za-z0-9_]")', + {'Data': 'string_ value(1)'}, + ), + ) diff --git a/tools/statvar_importer/property_value_mapper.py b/tools/statvar_importer/property_value_mapper.py new file mode 100644 index 0000000000..51bad68fdf --- /dev/null +++ b/tools/statvar_importer/property_value_mapper.py @@ -0,0 +1,614 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility class to store property:value mappings for data strings.""" + +import csv +import os +import re +import sys + +from absl import app +from absl import flags +from absl import logging +from collections import OrderedDict + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + +import config_flags +import eval_functions +import file_util + +import property_value_utils as pv_utils + +from config_map import ConfigMap, read_py_dict_from_file +from counters import Counters, CounterOptions + + +class PropertyValueMapper: + """Class to map strings to set of property values. + + Supports multiple maps with a namespace or context string. Stores string to + property:value maps as a dictionary: _pv_map = { + + 'GLOBAL': { + '': { + '': '' + '': '' + ... + }, + ... + }, + '' : { + '': { + '': '' + ... + }, + ... + }, + } + + The first level keys in _pv_map are namespaces that are column-headers or + 'GLOBAL'. + When looking up PVs for an input string, such as a column header or a cell + value, + first the namespace column-header is tried. + If there are no values then other namespacs such as 'GLOBAL are tried. + + within the PV can have a reference to another property. + Such reference are replaced with that property's value after + all PVs for a data cell have been collected. + + The references are indicated with the syntax '{Variable}' or '@Variable'. + where 'Variable' is expected to be another property in the cell's PVs. + + Internal properties that require special processing begin with '#', such as: + '#Regex': refers to a regular expression with names match groups + to be applied on a cell value + '#Format': a format string to be processed with other parameters + '#Eval': a python statement to be evaluated. It could have some computations + of the form = where the '' is evaluated and + assigned to property or to 'Data'. + + The cell value is mapped to the following default properties: + 'Data': the string value in the cell + 'Number': the numeric value if the cell is a number. 
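+
+    For illustration (hypothetical map content), a GLOBAL entry that maps the
+    input string 'Male' to a statvar property could look like:
+
+        _pv_map = {
+            'GLOBAL': {
+                'Male': {'gender': 'dcs:Male'},
+            },
+        }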
+ """ + + def __init__( + self, + pv_map_files: list = [], + config_dict: dict = None, + counters_dict: dict = None, + ): + self._config = ConfigMap(config_dict=config_dict) + self._counters = Counters( + counters_dict=counters_dict, + options=CounterOptions(debug=self._config.get('debug', False)), + ) + # Map from a namespace to dictionary of string-> { p:v} + self._pv_map = OrderedDict({'GLOBAL': {}}) + self._num_pv_map_keys = 0 + self._max_words_in_keys = 0 + for filename in pv_map_files: + namespace = 'GLOBAL' + if not file_util.file_get_matching(filename): + if ':' in filename: + namespace, filename = filename.split(':', 1) + self.load_pvs_from_file(filename, namespace) + logging.level_debug() and logging.debug( + f'Loaded PV map {self._pv_map} with max words {self._max_words_in_keys}' + ) + + def load_pvs_from_file(self, filename: str, namespace: str = 'GLOBAL'): + """Loads a map of the form 'string -> { P: V }' from a file. + + File is a python dictionary or a JSON file with python equivalents such as + True(true), False(false), None(null). + + Args: + filename: file containing the dictionary of string to dictionary of PVs + namespace: the namespace key for the dictionary to be loaded against. the + namespace is the first level key in the _pv_map. + """ + # Append new PVs to existing map. + pv_map_input = {} + if file_util.file_is_csv(filename): + # Load rows into a dict of prop,value + # if the first col is a config key, next column is its value + logging.info( + f'Loading PV maps for {namespace} from csv file: {filename}') + with file_util.FileIO(filename) as csvfile: + csv_reader = csv.reader(csvfile, + skipinitialspace=True, + escapechar='\\') + for row in csv_reader: + # Drop trailing empty columns in the row + last_col = len(row) - 1 + while last_col >= 0 and row[last_col].strip() == '': + last_col -= 1 + row = row[:last_col + 1] + if not row: + continue + key = row[0].strip() + if key in self._config.get_configs(): + # Add value to the config with same type as original. + value = ','.join(row[1:]) + config_flags.set_config_value(key, value, self._config) + else: + # Row is a pv map + pvs_list = row[1:] + if len(pvs_list) == 1: + # PVs list has no property, just a value. + # Use the namespace as the property + pvs_list = [namespace] + pvs_list.append(row[1]) + if len(pvs_list) % 2 != 0: + raise RuntimeError( + f'Invalid list of property value: {row} in {filename}' + ) + # Get property,values from the columns + pvs = {} + for i in range(0, len(pvs_list), 2): + prop = pvs_list[i].strip() + if not prop: + continue + value = pvs_list[i + 1].strip() + if value == '""': + value = '' + # Remove extra quotes around schema values. + # if value and value[0] == '"' and value[-1] == '"': + # value = value[1:-1].strip() + if value and value[0] != '[' and prop[0] != '#': + # Add quotes around text strings + # with spaces without commas. + # if re.search('[^,] +', value): + # value = f'"{value}"' + if value[0] == "'" and value[-1] == "'": + # Replace single quote with double quotes + # To distinguish quote as delimiter vs value in CSVs + # single quote is used instead of double quote in CSV values. + value[0] = '"' + value[-1] = '"' + #pvs[prop] = value + normalize = True + if '#' in prop or '=' in value: + # Value is a formula. e value as a string. 
+ normalize = False + pv_utils.add_key_value(prop, + value, + pvs, + self._config.get( + 'multi_value_properties', + {}), + normalize=normalize) + pv_map_input[key] = pvs + else: + logging.info( + f'Loading PV maps for {namespace} from dictionary file: {filename}' + ) + pv_map_input = read_py_dict_from_file(filename) + self.load_pvs_dict(pv_map_input, namespace) + + def load_pvs_dict(self, pv_map_input: dict, namespace: str = 'GLOBAL'): + if namespace not in self._pv_map: + self._pv_map[namespace] = {} + pv_map = self._pv_map[namespace] + word_delimiter = self._config.get('word_delimiter', ' ') + num_keys_added = 0 + for key, pvs_input in pv_map_input.items(): + if key not in pv_map: + pv_map[key] = {} + pvs_dict = pv_map[key] + if isinstance(pvs_input, str): + pvs_input = {namespace: pvs_input} + for p, v in pvs_input.items(): + num_keys_added += 1 + pv_utils.add_key_value( + p, + v, + pvs_dict, + self._config.get('multi_value_properties', {}), + ) + # Track the max number of words in any of the keys. + # This is used when splitting input-string for lookups. + num_words_key = len(pv_utils.get_words(key, word_delimiter)) + self._max_words_in_keys = max(self._max_words_in_keys, + num_words_key) + logging.level_debug() and logging.log( + 2, f'Setting PVMap[{key}] = {pvs_dict}') + + self._num_pv_map_keys += num_keys_added + logging.info( + f'Loaded {num_keys_added} property-value mappings for "{namespace}"' + ) + logging.level_debug() and logging.debug( + f'Loaded pv map {namespace}:{pv_map_input}') + + def get_pv_map(self) -> dict: + """Returns the dictionary mapping input-strings to property:values.""" + return self._pv_map + + def process_pvs_for_data(self, key: str, pvs: dict) -> bool: + """Processes property:value and returns true if processed successfully. + + Processes values for actionable props such as '#Regex', '#Eval', '#Format'. + Args: pvs (input/output) dictionary of property:values Properties such as + '#Regex', '#Eval', '#Format' are processed and resulting properties are + updated into pvs. + + Returns: + True if any property:values were processed and pvs dict was updated. + """ + logging.level_debug() and logging.log( + 2, f'Processing data PVs:{key}:{pvs}') + data_key = self._config.get('data_key', 'Data') + data = pvs.get(data_key, key) + is_modified = False + + # Process regular expression and add named group matches to the PV. + # Regex PV is of the form: '#Regex': '(?P[0-9]+) *- *(?P[0-9])' + # Parses 'Data': '10 - 20' to generate PVs: + # { 'Start': '10', 'End': '20' } + regex_key = self._config.get('regex_key', '#Regex') + if regex_key in pvs and data: + re_pattern = pvs[regex_key] + re_matches = re.finditer(re_pattern, data) + regex_pvs = {} + for match in re_matches: + regex_pvs.update(match.groupdict()) + logging.level_debug() and logging.log( + 2, + f'Processed regex: {re_pattern} on {key}:{data} to get {regex_pvs}' + ) + if regex_pvs: + self._counters.add_counter('processed-regex', 1, re_pattern) + pv_utils.pvs_update( + regex_pvs, pvs, + self._config.get('multi_value_properties', {})) + pvs.pop(regex_key) + is_modified = True + + # Format the data substituting properties with values. 
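+        # Illustrative (hypothetical) map entry: with pvs containing
+        # {'Start': '10', 'End': '20'}, the entry
+        #     '#Format': 'age=[{Start} {End} Years]'
+        # would add 'age': '[10 20 Years]' to the PVs via str.format below.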
+ format_key = self._config.get('format_key', '#Format') + if format_key in pvs: + format_str = pvs[format_key] + (format_prop, strf) = _get_variable_expr(format_str, data_key) + try: + format_data = strf.format(**pvs) + logging.level_debug() and logging.log( + 2, + f'Processed format {format_prop}={strf} on {key}:{data} to get' + f' {format_data}') + except (KeyError, ValueError) as e: + format_data = format_str + self._counters.add_counter('error-process-format', 1, + format_str) + logging.level_debug() and logging.log( + 2, + f'Failed to format {format_prop}={strf} on {key}:{data} with' + f' {pvs}, {e}') + if format_prop != data_key and format_data != format_str: + pvs[format_prop] = format_data + self._counters.add_counter('processed-format', 1, format_str) + pvs.pop(format_key) + is_modified = True + + # Evaluate the expression properties as local variables. + eval_key = self._config.get('eval_key', '#Eval') + if eval_key in pvs: + eval_str = pvs[eval_key] + eval_prop, eval_data = eval_functions.evaluate_statement( + eval_str, + pvs, + self._config.get('eval_globals', eval_functions.EVAL_GLOBALS), + ) + logging.level_debug() and logging.log( + 2, + f'Processed eval {eval_str} with {pvs} to get {eval_prop}:{eval_data}' + ) + if not eval_prop: + eval_prop = data_key + if eval_data and eval_data != eval_str: + pvs[eval_prop] = eval_data + self._counters.add_counter('processed-eval', 1, eval_str) + pvs.pop(eval_key) + is_modified = True + logging.level_debug() and logging.log( + 2, f'Processed data PVs:{is_modified}:{key}:{pvs}') + return is_modified + + def get_pvs_for_key(self, key: str, namespace: str = 'GLOBAL') -> dict: + """Return a dict of property-values that are mapped to the given key + within the dictionary for the namespace. + + Args: + key: input string to be looked up + namespace: the top level dictionary key to get the map within which + input-string is looked up. + + Returns: + dictionary of property:values for the input string. + """ + pvs = None + logging.level_debug() and logging.log( + 3, f'Search PVs for {namespace}:{key}') + if namespace in self._pv_map: + pvs = self._pv_map[namespace].get(key, None) + else: + # Check if key is unique and exists in any other map. + dicts_with_key = [] + pvs = {} + namespaces = self._config.get('default_pv_maps', ['GLOBAL']) + for namespace in namespaces: + logging.level_debug() and logging.log( + 3, f'Search PVs for {namespace}:{key}') + if namespace in self._pv_map.keys(): + pv_map = self._pv_map[namespace] + if key in pv_map: + dicts_with_key.append(namespace) + pv_utils.pvs_update( + pv_map[key], pvs, + self._config.get('multi_value_properties', {})) + if len(dicts_with_key) > 1: + logging.warning( + f'Duplicate key {key} in property maps: {dicts_with_key}') + self._counters.add_counter( + f'warning-multiple-property-key', + 1, + f'{key}:' + ','.join(dicts_with_key), + ) + if not pvs: + logging.level_debug() and logging.log( + 3, f'Missing key {key} in property maps') + self._counters.add_counter(f'warning-missing-property-key', 1, key) + return pvs + logging.level_debug() and logging.debug(f'Got PVs for {key}:{pvs}') + return pvs + + def get_pvs_for_key_variants(self, + key: str, + namespace: str = 'GLOBAL') -> list: + """Return a dict of property-values that are mapped to the given key + or its variantes with case lower case. + Args: + key: input string to be looked up + namespace: the top level dictionary key to get the map within which + input-string is looked up. 
+ + Returns: + a list of dictionary of property:values for the input string. + """ + if not key: + return None + pvs = self.get_pvs_for_key(key, namespace) + if not pvs: + # Check if GLOBAL map has key namespace:column-key + pvs = self.get_pvs_for_key(f'{namespace}:{key}') + if not pvs: + pvs = self.get_pvs_for_key(key.lower(), namespace) + if pvs: + pvs_list = [pvs] + pvs_list.append({self._config.get('pv_lookup_key', 'Key'): key}) + return pvs_list + # Check for keys with extra characters removed. + key_filtered = re.sub('[^A-Za-z0-9_%$-]+', ' ', key).strip() + if key_filtered != key: + return self.get_pvs_for_key_variants(key_filtered, namespace) + return None + + def _is_key_in_value(self, key: str, value: str) -> bool: + """Returns True if key is a substring of the value string. + + Only substrings separated by the word boundary are considered. + """ + if self._config.get('match_substring_word_boundary', True): + # Match substring around word boundaries. + while value: + pos = value.find(key) + if pos < 0: + return False + if (pos == 0 or not value[pos - 1].isalpha()) and ( + pos + len(key) <= len(value) or + not value[pos + len(key)].isalpha()): + return True + value = value[pos:] + return False + # key_pat = f'\\b{key}\\b' + # try: + # if re.search(key_pat, value, flags=re.IGNORECASE): + # return True + # else: + # return False + # except re.error as e: + # logging.error( + # f'Failed re.search({key_pat}, {value}) with exception: {e}' + # ) + # return False + + # Simple substring without word boundary checks. + if key.lower() in value.lower(): + return True + return False + + def get_pvs_for_key_substring(self, + value: str, + namespace: str = 'GLOBAL') -> dict: + """Return a dict of property-values for any key is a substring of value + + Args: + value: input string to be mapped to property:values + namespace: column header or context for the value string used as the key + for the first level dictionary in the pv_map. + + Returns: + List of dictionary of property:values that apply to the input string + after collecting all PVs for any key that is a substring of the value. + """ + # Get a list of namespaces to lookup. + # If none given, lookup in all namespaces. + namespaces = [] + if namespace and namespace in self._pv_map: + namespaces.append(namespace) + else: + namespaces = list(self._pv_map.keys()) + pvs_list = [] + keys_list = [] + for n in namespaces: + # Lookup keys from shortest to longest. + # Caller will merge PVs in the reverse order. + pv_map = self._pv_map[n] + sorted_keys = sorted(pv_map.keys(), key=len, reverse=True) + for key in sorted_keys: + if self._is_key_in_value(key, value): + pvs_list.append(pv_map[key]) + keys_list.append(key) + logging.level_debug() and logging.log( + 3, f'Got PVs for {key} in {value}: {pvs_list}') + value = value.replace(key, ' ') + logging.level_debug() and logging.log( + 2, + f'Returning pvs for substrings of {value} from {keys_list}:{pvs_list}' + ) + return pvs_list + + def get_all_pvs_for_value(self, + value: str, + namespace: str = 'GLOBAL', + max_fragment_size: int = None) -> list: + """Return a list of property:value dictionaries for an input string. + + Args: + value: input string to be mapped to property:values + namespace: context for the input string such as the column header. + max_fragment_size: the maximum number of words into which value can be + fragmented when looking for matching keys in the pv_map. + + Returns: + a list of dictionary of property:values. 
+ """ + logging.level_debug() and logging.log( + 1, f'Looking up PVs for {namespace}:{value}') + pvs = self.get_pvs_for_key_variants(value, namespace) + if pvs: + return pvs + # Split the value into n-grams and lookup PVs for each fragment. + word_delimiter = self._config.get('word_delimiter', ' ') + if not word_delimiter: + # Splitting of words is disabled. Don't match substrings. + return None + word_joiner = pv_utils.get_delimiter_char(word_delimiter) + words = pv_utils.get_words(value, word_delimiter) + if len(words) <= 1: + return None + max_fragment_words = len(words) - 1 + if not max_fragment_size: + max_fragment_size = self._max_words_in_keys + max_fragment_words = min(max_fragment_words, max_fragment_size) + + num_grams = (len(words) - max_fragment_size)**2 + if self._num_pv_map_keys < num_grams: + # Fewer keys than n-grams in input. + # Get PVs for keys in pv_map that are a substring of the input value. + return self.get_pvs_for_key_substring(value, namespace) + # Fewer n-grams than number of keys in map. + # Check if any input n-gram matches a key. + logging.level_debug() and logging.log( + 3, f'Looking up PVs for {max_fragment_words} words in {words}') + for num_words in range(max_fragment_words, 0, -1): + for start_index in range(0, len(words) - num_words + 1): + sub_value = word_joiner.join(words[start_index:start_index + + num_words]) + sub_pvs = self.get_pvs_for_key_variants(sub_value, namespace) + if sub_pvs: + # Got PVs for a fragment. + # Also lookup remaining fragments before and after this. + pvs_list = [] + before_value = word_delimiter.join(words[0:start_index]) + after_value = word_delimiter.join(words[start_index + + num_words:]) + logging.level_debug() and logging.log( + 3, + f'Got PVs for {start_index}:{num_words} in' + f' {words}:{sub_value}:{sub_pvs}, lookup pvs for {before_value},' + f' {after_value}', + ) + before_pvs = self.get_all_pvs_for_value( + # before_value, namespace, max_fragment_size=None) + before_value, + namespace, + max_fragment_size=num_words, + ) + after_pvs = self.get_all_pvs_for_value( + # after_value, namespace, max_fragment_size=None) + after_value, + namespace, + max_fragment_size=num_words, + ) + if before_pvs: + pvs_list.extend(before_pvs) + pvs_list.extend(sub_pvs) + if after_pvs: + pvs_list.extend(after_pvs) + logging.level_debug() and logging.log( + 2, f'Got PVs for fragments {before_value}:{before_pvs},' + f' {sub_value}:{sub_pvs}, {after_value}:{after_pvs}') + return pvs_list + return None + + +# Local utility functions +def _get_variable_expr(stmt: str, default_var: str = 'Data') -> (str, str): + """Parses a statement of the form = and returns variable, expr.""" + if '=' in stmt: + (var, expr) = stmt.split('=', 1) + return (var.strip(), expr) + return (default_var, stmt) + + +# PVMap utility functions +def load_pv_map(file: str) -> dict: + """Returns a PV map loaded from a file.""" + pvmap = PropertyValueMapper() + for file in file_util.file_get_matching(file): + pvmap.load_pvs_from_file(file) + pvs = pvmap.get_pv_map() + # Return the pvmap for the first namespace + if pvs: + return pvs[list(pvs.keys())[0]] + return {} + + +def write_pv_map(pvmap: dict, file: str) -> str: + """Write the PV map into a file.""" + if file_util.file_is_csv(file): + # Write pvmap as csv file with rows as : key,prop1,value1,prop2,value2 + with file_util.FileIO(file, 'w') as csv_file: + csv_writer = csv.writer(csv_file) + # Set CSV header as 'key, prop, value' + csv_writer.writerow(['key', 'property', 'value']) + # Write each pvmap node as a row. 
+ for key, pvs in pvmap.items(): + row = [key] + for prop, value in pvs.items(): + row.append(prop) + row.append(value) + csv_writer.writerow(row) + else: + file_util.file_write_py_dict(pvmap, file) + logging.info(f'Wrote {len(pvmap)} rows of PVs into {file}') diff --git a/tools/statvar_importer/property_value_mapper_test.py b/tools/statvar_importer/property_value_mapper_test.py new file mode 100644 index 0000000000..1cc441f667 --- /dev/null +++ b/tools/statvar_importer/property_value_mapper_test.py @@ -0,0 +1,112 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for property_value_mapper.py.""" + +import unittest + +import os +import sys + +from absl import app +from absl import logging +from property_value_mapper import PropertyValueMapper + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + + +class PropertyValueMapperTest(unittest.TestCase): + + def test_load_pvmap(self): + pv_mapper = PropertyValueMapper(pv_map_files=[ + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.py') + ]) + + # Verify PVmap has key 'GLOBAL' + pv_map = pv_mapper.get_pv_map() + self.assertTrue('GLOBAL' in pv_map) + self.assertTrue(len(pv_map['GLOBAL']) > 0) + + # Lookup PV Map for known key + pvs = pv_mapper.get_pvs_for_key('Males') + self.assertEqual(pvs, {'gender': 'dcs:Male'}) + + # Lookup PV Map for case mismatched key fails + pvs = pv_mapper.get_pvs_for_key('males') + self.assertEqual(pvs, None) + + # Load PVMap for a different namespace: Variable + pv_mapper.load_pvs_from_file( + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.csv'), + 'Variable') + self.assertTrue('Variable' in pv_mapper.get_pv_map()) + + # Lookup PVMap for 'Variable' column + pvs = pv_mapper.get_pvs_for_key('total', 'Variable') + self.assertEqual(pvs, {'populationType': 'dcs:Person'}) + # Verify keys from Variable are not retruned for GLOBAL + pvs = pv_mapper.get_pvs_for_key('total') + self.assertEqual(pvs, None) + + def test_pvmap_get_all_pvs(self): + pv_mapper = PropertyValueMapper(pv_map_files=[ + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.py'), + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.csv'), + ]) + self.assertEqual(len(pv_mapper.get_pv_map()), 1) + + # Verify matches for words in long key not in pv_map + pvs = pv_mapper.get_all_pvs_for_value('Total Males') + expected_pvs = [ + # PVs for 'total' + { + 'populationType': 'dcs:Person' + }, + { + 'Key': 'Total' + }, + # PVs for Male + { + 'gender': 'dcs:Male' + }, + { + 'Key': 'Males' + } + ] + self.assertEqual(pvs, expected_pvs) + + def test_process_pvs(self): + pv_mapper = PropertyValueMapper(pv_map_files=[ + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.py'), + os.path.join(_SCRIPT_DIR, 'test_data/sample_pv_map.csv'), + ]) + + pvs = pv_mapper.get_pvs_for_key('Person Age') + 
self.assertEqual( + pvs, { + '#Regex': '(?P[0-9]+)-(?P[0-9]+)', + 'age': 'dcid:{@StartAge}To{@EndAge}Years' + }) + # Verify processing of regex for range + self.assertTrue(pv_mapper.process_pvs_for_data('10-20', pvs)) + self.assertEqual( + pvs, { + 'EndAge': '20', + 'StartAge': '10', + 'age': 'dcid:{@StartAge}To{@EndAge}Years' + }) diff --git a/tools/statvar_importer/property_value_utils.py b/tools/statvar_importer/property_value_utils.py new file mode 100644 index 0000000000..a08bd0e65a --- /dev/null +++ b/tools/statvar_importer/property_value_utils.py @@ -0,0 +1,156 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for proerty:values.""" + +import os +import re +import sys + +from typing import Union + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + +from mcf_file_util import get_value_list, add_pv_to_node, strip_namespace + + +def is_valid_property(prop: str, schemaless: bool = False) -> bool: + """Returns True if the property begins with a letter, lowercase. + + If schemaless is true, property can begin with uppercase as well. + """ + if prop and isinstance(prop, str) and prop[0].isalpha(): + if schemaless or prop[0].islower(): + return True + return False + + +def is_valid_value(value: str) -> bool: + """Returns True if the value is valid without any references.""" + if value is None: + return False + if isinstance(value, str): + # Check there are no unresolved references. + if not value or value == '""': + return False + if '@' in value: + # Quoted strings can have @<2-letter-lang> suffix. + if not re.search('@[a-z]{2}"$', value): + return False + if '{' in value and '}' in value: + return False + return True + + +def is_schema_node(value: str) -> bool: + """Returns True if the value is a schema node reference.""" + if not value or not isinstance(value, str): + return False + if not value[0].isalpha() and value[0] != '[': + # Numbers or quoted strings are not schema nodes. + return False + # Check if string has any non alpha or non numeric codes + non_alnum_chars = [ + c for c in strip_namespace(value) + if not c.isalnum() and c not in ['_', '/', '[', ']', '.'] + ] + if non_alnum_chars: + return False + return True + + +def has_namespace(value: str) -> bool: + """Returns True if the value has a namespace of letters followed by ':'.""" + if not value or not isinstance(value, str): + return False + len_value = len(value) + pos = 0 + while pos < len_value: + if not value[pos].isalpha(): + break + pos += 1 + if pos < len_value and value[pos] == ':': + return True + return False + + +def add_key_value( + key: str, + value: str, + pvs: dict, + multi_value_keys: set = {}, + overwrite: bool = True, + normalize: bool = True, +) -> dict: + """Adds a key:value to the dict. 
+ + If the key already exists, adds value to a list if key is a multi_value key, + else replaces the value if overwrite is True. + """ + append_value = False + if key in multi_value_keys: + append_value = True + if not append_value and not overwrite and key in pvs: + # Do not add value if one exists and overwrite and append is disabled. + return pvs + return add_pv_to_node(key, + value, + pvs, + append_value=append_value, + normalize=normalize) + + +def get_value_as_list(value: str) -> Union[str, list]: + """Returns the value as a list or string.""" + if isinstance(value, list): + return value + if isinstance(value, str) and value: + if "," in value: + # Get a list of unique values + values = set() + values.update(get_value_list(value)) + value_list = list(values) + if len(value_list) == 1: + return value_list[0] + return value_list + return value + + +def pvs_update(new_pvs: dict, pvs: dict, multi_value_keys: set = {}) -> dict: + """Add the key:value pairs from the new_pvs into the pvs dictionary.""" + for prop, value in new_pvs.items(): + add_key_value(prop, value, pvs, multi_value_keys) + return pvs + + +def get_words(value: str, word_delimiter: str) -> list: + """Returns the list of non-empty words separated by the delimiter.""" + return [w for w in re.split(word_delimiter, value) if w] + + +def get_delimiter_char(re_delimiter: str) -> str: + """Returns a single delimiter character that can be used to join words + + from the first character in the delimiter regex. + """ + if re_delimiter: + if '|' in re_delimiter: + return re_delimiter.split('|')[0] + if re_delimiter[0] == '[': + return re_delimiter[1] + return ' ' diff --git a/tools/statvar_importer/test_data/sample_pv_map.csv b/tools/statvar_importer/test_data/sample_pv_map.csv new file mode 100644 index 0000000000..109debe6a0 --- /dev/null +++ b/tools/statvar_importer/test_data/sample_pv_map.csv @@ -0,0 +1,5 @@ +key,property,value +#Sample PVMap for test,lines begining with #,ignored +total,populationType,dcs:Person, +woman,gender,dcs:Female,age,[18 - Years] +man,gender,dcs:Male,age,[18 - Years] diff --git a/tools/statvar_importer/test_data/sample_pv_map.py b/tools/statvar_importer/test_data/sample_pv_map.py new file mode 100644 index 0000000000..1930c0cf44 --- /dev/null +++ b/tools/statvar_importer/test_data/sample_pv_map.py @@ -0,0 +1,71 @@ +{ + # Sample column map. + # Key is a substring of a row or column header. + # Value is a dictionary of property-value tuples to be applied to + # all elements in the row or column. + # If keys are overlapping, the longest key as a substring of a column is used. + # A column name can map to multiple keys for different parts of the string + # and all property-values for matching keys will be applied. + # + # Values can have references in the syntax "{variable}". + # The variable is replaced with the value from the final set of PVs. + # + # There are special references: + # {Number}: refers to the numeric value in a cell. + # {Data}: refers to other values in a cell that is not mapped to any PVs. + # : Use properties starting with a Capital letter to create + # local variables that are not emitted in the final output, but are place + # holders for replacements. + + # Columns with StatVarObservations should map "value" to "@Number". + + # Place + # Applied to all data values in the row. + "Fips Code": { + "observationAbout": "dcid:geoId/{@Number}" + }, + + # Time of observation + # Applied to all data values in the row. 
+ "Year": { + "observationDate": "@Number", + }, + + # Extract age bucket from a range of values. + "Person Age": { + "#Regex": "(?P[0-9]+)-(?P[0-9]+)", + "age": "dcid:{@StartAge}To{@EndAge}Years", + }, + + # Race: Mapping for values in Column Person Race + "WH": { + "race": "dcs:WhiteAlone", + }, + "A-PI": { + "race": "dcs:AsianOrPacificIslander", + }, + + # Population count observations fom column: "Total Persons". + # key can be normalized to lower case as well. + "total persons": { + "value": "@Number", + "populationType": "dcs:Person", + "measuredProperty": "dcs:count", + }, + + # Another observation for column: Fraction of population + "fraction": { + "populationType": "dcs:Person", + # "measuredProperty" : "dcs:count", # Is the default value for SVObs. + "measurementDenominator": "dcid:Count_Person", + "value": "@Number", + }, + + # Extract PVs from section headers + 'Males': { + 'gender': "dcs:Male", + }, + 'Females': { + 'gender': "dcs:Female", + }, +} diff --git a/util/config_map.py b/util/config_map.py index 1ea640107e..dc75566d0d 100644 --- a/util/config_map.py +++ b/util/config_map.py @@ -55,28 +55,32 @@ ''' import ast +from collections import OrderedDict import collections.abc import pprint import sys +from typing import Union from absl import logging -from collections import OrderedDict -from typing import Union +import file_util class ConfigMap: - '''Class to store config mapping of named parameters to values as a dictionary.''' - - def __init__(self, - config_dict: dict = None, - filename: str = None, - config_string: str = None): - '''Create a Config Map object. - Args: - config_dict: dictionary with key:values to be loaded into the config map. - filename: override the dictionary with key:values from the file. - config_string: string of dictionary parameters to override key:values. - ''' + """Class to store config mapping of named parameters to values as a dictionary.""" + + def __init__( + self, + config_dict: dict = None, + filename: str = None, + config_string: str = None, + ): + """Create a Config Map object. + + Args: + config_dict: dictionary with key:values to be loaded into the config map. + filename: override the dictionary with key:values from the file. + config_string: string of dictionary parameters to override key:values. + """ self._config_dict = dict() # Add configs from input args. if config_dict: @@ -89,181 +93,191 @@ def __init__(self, logging.debug(f'Loaded ConfigMap: {self.get_configs()}') def load_config_file(self, filename: str) -> dict: - '''Load configs from a file overwriting any existing parameter with a new value. + """Load configs from a file overwriting any existing parameter with a new value. - Args: - filename: a py or json file with a dictionary of parameter:value mappings. + Args: + filename: a py or json file with a dictionary of parameter:value + mappings. - Returns: - dictionary with all config parameters after updates from the file. - ''' + Returns: + dictionary with all config parameters after updates from the file. + """ if filename: self.add_configs(read_py_dict_from_file(filename)) return self._config_dict def load_config_string(self, config_params_str: str) -> dict: - '''Loads a JSON config dictionary overriding existing configs. + """Loads a JSON config dictionary overriding existing configs. - Args: - config_params_str: JSON string with a dictionary of parameter:value mappings. + Args: + config_params_str: JSON string with a dictionary of parameter:value + mappings. 
- Returns: - dictionary with all config parameters after updates. - ''' + Returns: + dictionary with all config parameters after updates. + """ if config_params_str: param_dict = ast.literal_eval(config_params_str) self.add_configs(param_dict) return self._config_dict def add_configs(self, configs: dict) -> dict: - '''Add new or replace existing config parameters - - Nested parameters with dict, or list values are replaced. - Use update_config() for a deep-update of nested parameters. + """Add new or replace existing config parameters + + Nested parameters with dict, or list values are replaced. + Use update_config() for a deep-update of nested parameters. + + For example, assume config-dict has a nested dict: + with an config dict set as follows: self._config_dict = { + 'int-param': 10, + 'nested-dict1': { + 'param1': 123, + } + } + add_config({ 'nested-dict1': { 'param2': abc }) + will return { + 'int-param': 10, + 'nested-dict1': { + 'param2': abc, # older key:values from nested-dict removed. + } + } - For example, assume config-dict has a nested dict: - with an config dict set as follows: self._config_dict = { - 'int-param': 10, - 'nested-dict1': { - 'param1': 123, - } - } - add_config({ 'nested-dict1': { 'param2': abc }) - will return { - 'int-param': 10, - 'nested-dict1': { - 'param2': abc, # older key:values from nested-dict removed. - } - } - - Args: - configs: dictionary with new parameter:value mappings - that are updated into existing dict. - Nested dict objects within the dict are replaced. + Args: + configs: dictionary with new parameter:value mappings that are updated + into existing dict. Nested dict objects within the dict are replaced. - Returns: - dictionary with all parameter:value mappings. - ''' + Returns: + dictionary with all parameter:value mappings. + """ if configs: self._config_dict.update(configs) return self._config_dict def update_config(self, configs: dict) -> dict: - '''Does a deep update of the dict updating nested dicts as well. - For example, assume config-dict has a nested dict: - self._config_dict = { - 'nested-dict1': { - 'param1': 123, - 'nested-dict2': { - 'param2': 345, - } - } + """Does a deep update of the dict updating nested dicts as well. + + For example, assume config-dict has a nested dict: + + self._config_dict = { + 'nested-dict1': { + 'param1': 123, + 'nested-dict2': { + 'param2': 345, } + } + } + + update_config(configs={ + 'nested-dict1': { + 'param1': 321, + 'param1-2': 456, + 'nested-dict2': { + 'param2-1': 789, + }, + }) + + will result in an updated config_dict: + { + 'nested-dict1': { + 'param1': 321, # updated + 'param1-2': 456, # added + 'nested-dict2': { + 'param2': 345, # original + 'param2-1': 789, # added + }, + } + + Args: + configs: dictionary with additional parameter:value mappings. - update_config(configs={ - 'nested-dict1': { - 'param1': 321, - 'param1-2': 456, - 'nested-dict2': { - 'param2-1': 789, - }, - }) - - will result in an updated config_dict: - { - 'nested-dict1': { - 'param1': 321, # updated - 'param1-2': 456, # added - 'nested-dict2': { - 'param2': 345, # original - 'param2-1': 789, # added - }, - } - - Args: - configs: dictionary with additional parameter:value mappings. - - Returns: - dictionary with all parameter:value mappings. - ''' + Returns: + dictionary with all parameter:value mappings. + """ return _deep_update(self._config_dict, configs) def get(self, parameter: str, default_value=None) -> Union[str, int, float, list, dict]: - '''Return the value of a named config parameter. 
+ """Return the value of a named config parameter. - Args: - parameter: name of the parameter to lookup - default_value: Default value to be returned if the parameter doesn't exist. + Args: + parameter: name of the parameter to lookup + default_value: Default value to be returned if the parameter doesn't + exist. - Returns: - value of the parameter in the config dict if it exists or the default_value. - ''' + Returns: + value of the parameter in the config dict if it exists or the + default_value. + """ return self._config_dict.get(parameter, default_value) def get_configs(self) -> dict: - '''Return a reference to the config dictionary. + """Return a reference to the config dictionary. - Any modifications to the dict is reflected within this object as well. - ''' + Any modifications to the dict is reflected within this object as well. + """ return self._config_dict def set_config(self, parameter: str, value): - '''Set the value for a parameter overwriting one if it already exists - Args: - parameter: Name of the parameter - value: Value to be set. - ''' + """Set the value for a parameter overwriting one if it already exists + + Args: + parameter: Name of the parameter + value: Value to be set. + """ self._config_dict[parameter] = value def get_config_str(self) -> str: - '''Returns the config dictionary as a pretty string.''' + """Returns the config dictionary as a pretty string.""" return pprint.pformat(self._config_dict, indent=4) def write_config(filename: str): - '''Write the config dictionary into a file. + """Write the config dictionary into a file. - Args: - filename: name of the file to write. - ''' + Args: + filename: name of the file to write. + """ with open(filename, 'w') as file: file.write(self.get_config_str()) def get_config_map_from_file(filename: str) -> ConfigMap: - '''Returns a ConfigMap object with parameters loaded from a file. + """Returns a ConfigMap object with parameters loaded from a file. - Args: - filename: name of the file to load. + Args: + filename: name of the file to load. - Returns: - ConfigMap object with all the parameters loaded into the config_dict. - ''' + Returns: + ConfigMap object with all the parameters loaded into the config_dict. + """ return ConfigMap(filename=filename) def _deep_update(src: dict, add_dict: dict) -> dict: - '''Deep update of parameters in add_dict into src. + """Deep update of parameters in add_dict into src. - Args: - src: source dictionary into which new parameters are added. - add_dict: dictionary with new parameters to be added. + Args: + src: source dictionary into which new parameters are added. + add_dict: dictionary with new parameters to be added. - Returns: - src dictionary with updated parameters. + Returns: + src dictionary with updated parameters. - Note: - Assumes the new dictionary has same type(dict/list) for updated parameters. - ''' + Note: + Assumes the new dictionary has same type(dict/list) for updated parameters. + """ for k, v in add_dict.items(): if isinstance(v, collections.abc.Mapping): src[k] = _deep_update(src.get(k, {}), v) elif isinstance(v, list): # TODO: deep update of list + if k not in src: + src[k] = list() src[k].extend(v) elif isinstance(v, set): # TODO: deep update of set + if k not in src: + src[k] = set() src[k].update(v) else: src[k] = v @@ -271,38 +285,31 @@ def _deep_update(src: dict, add_dict: dict) -> dict: def read_py_dict_from_file(filename: str) -> dict: - '''Read a python dict from a file. - - Args: - filename: JSON or a python file containing dict of parameter to value mappings. 
- The file can have comments and extra commas at the end. - Example: '{ 'abc': 123, 'def': 'lmn' } - Note: It assumes bools are in Python: True, False and None is used for 'null'. - - Returns: - dictionary loaded from the file. - - Raises: - exceptions on parsing errors string dict from literal_eval() - ''' - logging.info(f'Reading python dict from {filename}...') - with open(filename) as file: - dict_str = file.read() - - # Load the map assuming a python dictionary. - # Can also be used with JSON with trailing commas and comments. - param_dict = ast.literal_eval(dict_str) + """Read a python dict from a file. + + Args: + filename: JSON or a python file containing dict of parameter to value + mappings. The file can have comments and extra commas at the end. + Example: '{ 'abc': 123, 'def': 'lmn' } + Note: It assumes bools are in Python: True, False and None is used for + 'null'. + + Returns: + dictionary loaded from the file. + + Raises: + exceptions on parsing errors string dict from literal_eval() + """ + param_dict = file_util.file_load_py_dict(filename) logging.debug(f'Loaded {filename} into dict {param_dict}') return param_dict def write_py_dict_to_file(py_dict: dict, filename: str): - '''Write a python dict into a file. + """Write a python dict into a file. - Args: - py_dict: Dictionary to save into the file. - filename: file to write into. - ''' - logging.info(f'Writing python dict into {filename}') - with open(filename, 'w') as file: - file.write(pprint.pformat(py_dict, indent=4)) + Args: + py_dict: Dictionary to save into the file. + filename: file to write into. + """ + file_util.file_write_py_dict(py_dict, filename) diff --git a/util/counters.py b/util/counters.py index 65e43c4801..6f7a0a3285 100644 --- a/util/counters.py +++ b/util/counters.py @@ -13,6 +13,8 @@ # limitations under the License. '''Class for dictionary of named counters.''' +import os +import psutil import sys import time @@ -89,7 +91,7 @@ def __init__(self, def __del__(self): '''Log the counters.''' - self._update_processing_rate() + self._update_periodic_counters() logging.info(self.get_counters_string()) def add_counter(self, @@ -212,7 +214,7 @@ def print_counters(self, file=sys.stderr): Args: file: file handle to emit counters string. ''' - self._update_processing_rate() + self._update_periodic_counters() print(self.get_counters_string(), file=file) def print_counters_periodically(self): @@ -234,7 +236,7 @@ def reset_start_time(self): def set_prefix(self, prefix: str): '''Set the prefix for the counter names. Also resets the start_time and processing rate counters.''' - self._update_processing_rate() + self._update_periodic_counters() self._prefix = prefix self.reset_start_time() logging.info(self.get_counters_string()) @@ -251,6 +253,11 @@ def _get_counter_name(self, name: str, debug_context: str = None): name = name + f'_{debug_context}' return name + def _update_periodic_counters(self): + '''Update periodic counters.''' + self._update_processing_rate() + self._update_process_counters() + def _update_processing_rate(self): '''Update the processing rate and remaining time. 
Uses the option: 'processed' to get the counter for processing rate @@ -271,3 +278,13 @@ def _update_processing_rate(self): if totals: self.set_counter('process_remaining_time', max(0, (totals - num_processed)) / rate) + + def _update_process_counters(self): + '''Update process counters for memory and time.''' + process = psutil.Process(os.getpid()) + mem = process.memory_info() + self.max_counter('process-mem-rss', mem.rss) + self.max_counter('process-mem', mem.vms) + cpu_times = process.cpu_times() + self.set_counter('process-time-user-secs', cpu_times.user) + self.set_counter('process-time-sys-secs', cpu_times.system) diff --git a/util/dc_api_wrapper.py b/util/dc_api_wrapper.py index b0f12dfb8b..9024fa123f 100644 --- a/util/dc_api_wrapper.py +++ b/util/dc_api_wrapper.py @@ -11,17 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -'''Wrapper utilities for data commons API.''' +"""Wrapper utilities for data commons API.""" -import sys +from collections import OrderedDict import os -import datacommons as dc -import requests_cache +import sys import time import urllib from absl import logging -from collections import OrderedDict +import datacommons as dc +import requests_cache _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(_SCRIPT_DIR) @@ -39,28 +39,33 @@ _DC_API_PATH_RESOLVE_COORD = '/v1/recon/resolve/coordinate' -def dc_api_wrapper(function, - args: dict, - retries: int = 3, - retry_secs: int = 1, - use_cache: bool = False, - api_root: str = None): - '''Wrapper for a DC APi call with retries and caching. - Returns the result from the DC APi call function. - In case of errors, retries the function with a delay a fixed number of times. - - Args: - function: The DataCommons API function. - args: dictionary with any the keyword arguments for the DataCommons API function. - retries: Number of retries in case of HTTP errors. - retry_sec: Interval in seconds between retries for which caller is blocked. - use_cache: If True, uses request cache for faster response. - api_root: The API server to use. Default is 'http://api.datacommons.org'. - To use autopush with more recent data, set it to 'http://autopush.api.datacommons.org' - - Returns: - The response from the DataCommons API call. - ''' +def dc_api_wrapper( + function, + args: dict, + retries: int = 3, + retry_secs: int = 1, + use_cache: bool = False, + api_root: str = None, +): + """Wrapper for a DC APi call with retries and caching. + + Returns the result from the DC APi call function. In case of errors, retries + the function with a delay a fixed number of times. + + Args: + function: The DataCommons API function. + args: dictionary with any the keyword arguments for the DataCommons API + function. + retries: Number of retries in case of HTTP errors. + retry_sec: Interval in seconds between retries for which caller is blocked. + use_cache: If True, uses request cache for faster response. + api_root: The API server to use. Default is 'http://api.datacommons.org'. To + use autopush with more recent data, set it to + 'http://autopush.api.datacommons.org' + + Returns: + The response from the DataCommons API call. 
+ """ if api_root: dc.utils._API_ROOT = api_root logging.debug(f'Setting DC API root to {api_root} for {function}') @@ -80,42 +85,54 @@ def dc_api_wrapper(function, for attempt in range(retries): try: logging.debug( - f'Invoking DC API {function}, #{attempt} with {args}, retries={retries}' - ) + f'Invoking DC API {function}, #{attempt} with {args},' + f' retries={retries}') response = function(**args) logging.debug( f'Got API response {response} for {function}, {args}') return response - except KeyError: - # Exception in case of API error. + except KeyError as e: + # Exception in case of missing dcid. Don't retry. + logging.error(f'Got exception for api: {function}, {e}') return None - except urllib.error.URLError: + except (urllib.error.URLError, urllib.error.HTTPError, + ValueError) as e: # Exception when server is overloaded, retry after a delay if attempt >= retries: + logging.error( + f'Got exception for api: {function}, {e}, no more retries' + ) raise urllib.error.URLError else: logging.debug( - f'Retrying API {function} after {retry_secs}...') + f'Got exception {e}, retrying API {function} after' + f' {retry_secs}...') time.sleep(retry_secs) return None -def dc_api_batched_wrapper(function, - dcids: list, - args: dict, - config: dict = None) -> dict: - '''A wrapper for DC API on dcids with batching support. +def dc_api_batched_wrapper( + function, + dcids: list, + args: dict, + dcid_arg_kw: str = 'dcid', + headers: dict = {}, + config: dict = None, +) -> dict: + """A wrapper for DC API on dcids with batching support. + Returns the dictionary result for the function call across all arguments. - It batches the dcids to make multiple calls to the DC API and merges all results. + It batches the dcids to make multiple calls to the DC API and merges all + results. Args: function: DC API to be invoked. It should have dcids as one of the arguments and should return a dictionary with dcid as the key. - dcids: List of dcids to be invoked with the function. - The namespace is stripped from the dcid before the call to the DC API. + dcids: List of dcids to be invoked with the function. The namespace is + stripped from the dcid before the call to the DC API. args: Additional arguments for the function call. - config: dictionary of DC API configuration settings. - The supported settings are: + config: dictionary of DC API configuration settings. The supported settings + are: dc_api_batch_size: Number of dcids to invoke per API call. dc_api_retries: Number of times an API can be retried. dc_api_retry_sec: Interval in seconds between retries. @@ -124,7 +141,7 @@ def dc_api_batched_wrapper(function, Returns: Merged function return values across all dcids. - ''' + """ if not config: config = {} api_result = {} @@ -132,8 +149,8 @@ def dc_api_batched_wrapper(function, num_dcids = len(dcids) api_batch_size = config.get('dc_api_batch_size', dc.utils._MAX_LIMIT) logging.debug( - f'Calling DC API {function} on {len(dcids)} dcids in batches of {api_batch_size} with args: {args}...' - ) + f'Calling DC API {function} on {len(dcids)} dcids in batches of' + f' {api_batch_size} with args: {args}...') while index < num_dcids: # dcids in batches. 
dcids_batch = [ @@ -141,11 +158,14 @@ def dc_api_batched_wrapper(function, ] index += api_batch_size args['dcids'] = dcids_batch - batch_result = dc_api_wrapper(function, args, - config.get('dc_api_retries', 3), - config.get('dc_api_retry_secs', 5), - config.get('dc_api_use_cache', False), - config.get('dc_api_root', None)) + batch_result = dc_api_wrapper( + function, + args, + config.get('dc_api_retries', 3), + config.get('dc_api_retry_secs', 5), + config.get('dc_api_use_cache', False), + config.get('dc_api_root', None), + ) if batch_result: api_result.update(batch_result) logging.debug(f'Got DC API result for {function}: {batch_result}') @@ -155,17 +175,19 @@ def dc_api_batched_wrapper(function, def dc_api_is_defined_dcid(dcids: list, wrapper_config: dict = None) -> dict: - '''Returns a dicttionary with dcids mapped to True/False based on whether - the dcid is defined in the API and has a 'typeOf' property. - Uses the property_value() DC API to lookup 'typeOf' for each dcid. - dcids not defined in KG get a value of False. - Args: - dcids: List of dcids. The namespace is stripped from the dcid. - wrapper_config: dictionary of configurationparameters for the wrapper. - See dc_api_batched_wrapper and dc_api_wrapper for details. - Returns: - dictionary with each input dcid mapped to a True/False value. - ''' + """Returns a dictionary with dcids mapped to True/False based on whether + + the dcid is defined in the API and has a 'typeOf' property. + Uses the property_value() DC API to lookup 'typeOf' for each dcid. + dcids not defined in KG get a value of False. + Args: + dcids: List of dcids. The namespace is stripped from the dcid. + wrapper_config: dictionary of configurationparameters for the wrapper. See + dc_api_batched_wrapper and dc_api_wrapper for details. + + Returns: + dictionary with each input dcid mapped to a True/False value. + """ api_function = dc.get_property_values args = { 'prop': 'typeOf', @@ -183,26 +205,68 @@ def dc_api_is_defined_dcid(dcids: list, wrapper_config: dict = None) -> dict: return response +def dc_api_get_node_property(dcids: list, + prop: str, + wrapper_config: dict = None) -> dict: + """Returns a dictionary keyed by dcid with { prop:value } for each dcid. + + Uses the get_property_values() DC API to lookup the property for each dcid. + + Args: + dcids: List of dcids. The namespace is stripped from the dcid. + wrapper_config: dictionary of configurationparameters for the wrapper. See + dc_api_batched_wrapper and dc_api_wrapper for details. + + Returns: + dictionary with each input dcid mapped to a True/False value. + """ + api_function = dc.get_property_values + args = { + 'prop': prop, + 'out': True, + } + api_result = dc_api_batched_wrapper(api_function, dcids, args, + wrapper_config) + response = {} + for dcid in dcids: + dcid_stripped = _strip_namespace(dcid) + value = api_result.get(dcid_stripped) + if value: + response[dcid] = {prop: value} + return response + + def dc_api_get_node_property_values(dcids: list, wrapper_config: dict = None) -> dict: - '''Returns all the property values for a set of dcids from the DC API. - Args: - dcids: list of dcids to lookup - wrapper_config: configuration parameters for the wrapper. - See dc_api_batched_wrapper() and dc_api_wrapper() for details. - Returns: - dictionary with each dcid with the namspace 'dcid:' as the key - mapped to a dictionary of property:value. - ''' + """Returns all the property values for a set of dcids from the DC API. 
+ + Args: + dcids: list of dcids to lookup + wrapper_config: configuration parameters for the wrapper. See + dc_api_batched_wrapper() and dc_api_wrapper() for details. + + Returns: + dictionary with each dcid with the namspace 'dcid:' as the key + mapped to a dictionary of property:value. + """ predefined_nodes = OrderedDict() api_function = dc.get_triples api_triples = dc_api_batched_wrapper(api_function, dcids, {}, wrapper_config) if api_triples: for dcid, triples in api_triples.items(): + if (_strip_namespace(dcid) not in dcids and + _add_namespace(dcid) not in dcids): + continue pvs = {} for d, prop, val in triples: - pvs[prop] = val + if d == dcid and val: + # quote string values with spaces if needed + if ' ' in val and val[0] != '"': + val = '"' + val + '"' + if prop in pvs: + val = pvs[prop] + ',' + val + pvs[prop] = val if len(pvs) > 0: if 'Node' not in pvs: pvs['Node'] = _add_namespace(dcid) @@ -210,16 +274,16 @@ def dc_api_get_node_property_values(dcids: list, return predefined_nodes -def dc_api_resolve_placeid(dcids: list) -> dict: - '''Returns the resolved dcid for each of the placeid. +def dc_api_resolve_placeid(dcids: list, in_prop: str = 'placeId') -> dict: + """Returns the resolved dcid for each of the placeid. - Args: - dcids: list of placeids to be resolved. + Args: + dcids: list of placeids to be resolved. - Returns: - dictionary keyed by input placeid with reoslved dcid as value. - ''' - data = {'in_prop': 'placeId', 'out_prop': 'dcid'} + Returns: + dictionary keyed by input placeid with reoslved dcid as value. + """ + data = {'in_prop': in_prop, 'out_prop': 'dcid'} data['ids'] = dcids num_ids = len(dcids) api_url = dc.utils._API_ROOT + _DC_API_PATH_RESOLVE_ID @@ -241,14 +305,14 @@ def dc_api_resolve_placeid(dcids: list) -> dict: def dc_api_resolve_latlng(dcids: list) -> dict: - '''Returns the resolved dcid for each of the placeid. + """Returns the resolved dcid for each of the placeid. - Args: - dcids: list of placeids to be resolved. + Args: + dcids: list of placeids to be resolved. - Returns: - dictionary keyed by input placeid with reoslved dcid as value. - ''' + Returns: + dictionary keyed by input placeid with reoslved dcid as value. + """ data = {} data['coordinates'] = dcids num_ids = len(dcids) @@ -264,8 +328,8 @@ def dc_api_resolve_latlng(dcids: list) -> dict: if recon_resp: for entity in recon_resp.get('placeCoordinates', []): dcids = entity.get('placeDcids', '') - lat = entity.get("latitude", "") - lng = entity.get("longitude", "") + lat = entity.get('latitude', '') + lng = entity.get('longitude', '') place_id = f'{lat}{lng}' if place_id and dcids: results[place_id] = entity @@ -273,17 +337,19 @@ def dc_api_resolve_latlng(dcids: list) -> dict: def _add_namespace(value: str, namespace: str = 'dcid') -> str: - '''Returns the value with a namespace prefix for references. - Args: - value: string to which namespace is to be added. - Returns: - value with the namespace prefix if the value is not a quoted string - and doesn't have a namespace already. - O/w return the value as is. - - Any sequence of letters followed by a ':' is treated as a namespace. - Quoted strings are assumed to start with '"' and won't get a namespace. - ''' + """Returns the value with a namespace prefix for references. + + Args: + value: string to which namespace is to be added. + + Returns: + value with the namespace prefix if the value is not a quoted string + and doesn't have a namespace already. + O/w return the value as is. 
+ + Any sequence of letters followed by a ':' is treated as a namespace. + Quoted strings are assumed to start with '"' and won't get a namespace. + """ if value and isinstance(value, str): if value[0].isalpha() and value.find(':') < 0: return f'{namespace}:{value}' @@ -291,15 +357,17 @@ def _add_namespace(value: str, namespace: str = 'dcid') -> str: def _strip_namespace(value: str) -> str: - '''Returns the value without the namespace prefix. - Args: - value: string from which the namespace prefix is to be removed. - Returns: - value without the namespace prefix if there was a namespace - - Any sequence of letters followed by a ':' is treated as a namespace. - Quoted strings are assumed to start with '"' and won't be filtered. - ''' + """Returns the value without the namespace prefix. + + Args: + value: string from which the namespace prefix is to be removed. + + Returns: + value without the namespace prefix if there was a namespace + + Any sequence of letters followed by a ':' is treated as a namespace. + Quoted strings are assumed to start with '"' and won't be filtered. + """ if value and isinstance(value, str) and value[0].isalnum(): return value[value.find(':') + 1:].strip() return value diff --git a/util/dc_api_wrapper_test.py b/util/dc_api_wrapper_test.py index 099f1be159..2a38d41e41 100644 --- a/util/dc_api_wrapper_test.py +++ b/util/dc_api_wrapper_test.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -'''Tests for dc_api_wrapper.''' +"""Tests for dc_api_wrapper.""" import os import sys @@ -30,7 +30,7 @@ class TestDCAPIWrapper(unittest.TestCase): def test_dc_api_wrapper(self): - '''Test the wrapper for DC API.''' + """Test the wrapper for DC API.""" api_function = dc.get_property_labels dcids = [ 'Count_Person', # 'dcid:' namespace will be removed. @@ -42,7 +42,7 @@ def test_dc_api_wrapper(self): self.assertTrue('typeOf' in response['Count_Person']) def test_dc_api_batched_wrapper(self): - '''Test DC API wrapper for batched calls.''' + """Test DC API wrapper for batched calls.""" api_function = dc.get_property_values dcids = [ 'Count_Person', # Statvar defined in DC @@ -58,7 +58,7 @@ def test_dc_api_batched_wrapper(self): self.assertFalse(response['NewStatVar_NotInDC']) def test_dc_api_is_defined_dcid(self): - '''Test API wrapper for defined DCIDs.''' + """Test API wrapper for defined DCIDs.""" dcids = [ 'geoId/06', # Geo Id defined. 'country/ZZZ', # Geo Id not defined. 
@@ -66,10 +66,12 @@ def test_dc_api_is_defined_dcid(self): 'schema:Year', # Class ] response = dc_api.dc_api_is_defined_dcid( - dcids, { + dcids, + { 'dc_api_batch_size': 2, - 'dc_api_root': 'http://autopush.api.datacommons.org' - }) + 'dc_api_root': 'http://autopush.api.datacommons.org', + }, + ) self.assertTrue(response is not None) self.assertEqual(len(response), len(dcids)) self.assertTrue(response['geoId/06']) @@ -79,8 +81,8 @@ def test_dc_api_is_defined_dcid(self): self.assertTrue(response['dcs:value']) def test_dc_get_node_property_values(self): - '''Test API wrapper to get all property:values for a node.''' - node_pvs = dc_api.dc_api_get_node_property_values(['dcs:Count_Person']) + """Test API wrapper to get all property:values for a node.""" + node_pvs = dc_api.dc_api_get_node_property_values(['dcid:Count_Person']) self.assertTrue(node_pvs) # Verify the resposnse has dcid with the namespace prefix 'dcid:' self.assertTrue('dcid:Count_Person' in node_pvs) diff --git a/util/download_util.py b/util/download_util.py index aac96a10df..091a251773 100644 --- a/util/download_util.py +++ b/util/download_util.py @@ -90,6 +90,7 @@ def test_my_function(self): def request_url(url: str, params: dict = {}, method: str = 'GET', + headers: dict = {}, output: str = 'text', timeout: int = 30, retries: int = 3, @@ -147,12 +148,18 @@ def request_url(url: str, for attempt in range(retries): try: logging.debug( - f'Downloading URL {url}, params:{params}, {method} #{attempt}, retries={retries}' + f'Downloading URL {url}, headers:{headers} params:{params}, {method} #{attempt}, retries={retries}' ) if 'get' in method.lower(): - response = requests.get(url, params=params, timeout=timeout) + response = requests.get(url, + headers=headers, + params=params, + timeout=timeout) else: - response = requests.post(url, json=params, timeout=timeout) + response = requests.post(url, + headers=headers, + json=params, + timeout=timeout) logging.debug( f'Got API response {response} for {url}, {params}') if response.ok: @@ -166,14 +173,16 @@ def request_url(url: str, # Exception in case of API error. return None except (requests.exceptions.ConnectTimeout, - urllib.error.URLError) as e: - # Exception when server is overloaded, retry after a delay - if attempt >= retries: - raise urllib.error.URLError - else: - logging.debug( - f'Retrying URL {url} after {retry_secs} secs ...') - time.sleep(retry_secs) + requests.exceptions.ConnectionError, urllib.error.URLError, + urllib.error.HTTPError) as e: + logging.debug(f'Got exception {e} for {url}, {params}') + + # retry in case of errors + if attempt >= retries: + raise urllib.error.URLError + else: + logging.debug(f'Retrying URL {url} after {retry_secs} secs ...') + time.sleep(retry_secs) return None diff --git a/util/file_util.py b/util/file_util.py index d961d13e95..d858c36a06 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -18,22 +18,27 @@ """ import ast +import chardet import csv import fnmatch import glob +import gspread +import io import json import os import pickle import pprint import sys import tempfile -from typing import Union + +import numpy as np from absl import app from absl import logging from aggregation_util import aggregate_dict, aggregate_value from google.cloud import storage -import gspread +from retry.api import retry_call +from typing import Union class FileIO: @@ -428,9 +433,12 @@ def file_get_name(file_path: str, Returns: file name combined from path, suffix and extension. """ - # Create the file directory if it doesn't exist. 
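# --- Editor's illustrative sketch (not part of the original patch) ---
# The request_url() change in download_util.py above threads a `headers` dict
# through to requests.get/post and retries on connection errors. This is a
# minimal standalone retry loop with the same shape; the URL and header
# values are hypothetical placeholders, not endpoints from the patch.
import time

import requests


def get_with_retries(url: str,
                     headers: dict = None,
                     retries: int = 3,
                     retry_secs: int = 5,
                     timeout: int = 30):
    """Returns the JSON response for url, retrying on connection errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.ok:
                return response.json()
            return None
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout):
            if attempt + 1 >= retries:
                raise
            time.sleep(retry_secs)
    return None


# get_with_retries('https://example.com/api', headers={'X-API-Key': 'KEY'})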
+ if not file_path: + return None if file_is_google_spreadsheet(file_path): + # Don't modify spreadsheets return file_path + # Create the file directory if it doesn't exist. file_makedirs(file_path) file_prefix, ext = os.path.splitext(file_path) if file_prefix.endswith(suffix): @@ -508,6 +516,7 @@ def file_load_csv_dict( value_column: str = None, delimiter: str = ',', config: dict = {}, + key_index: bool = False, ) -> dict: """Returns a dictionary loaded from a CSV file. @@ -540,18 +549,26 @@ def file_load_csv_dict( config: dictionary of aggregation settings in case there are multiple rows with the same key. refer to aggregation_util.aggregate_dict for config settings. - + key_index: if True, each row is loaded with a unique key for row index. + Overrides key_column and uses index as key. Returns: dictionary of {key:value} loaded from the CSV file. """ csv_dict = {} input_files = file_get_matching(filename) logging.debug(f'Loading dict from csv files: {input_files}') + if key_column and key_index: + raise ValueError( + f'Both Key_column: {key_column} and key_index set for {filename}') + for filename in input_files: num_rows = 0 # Load each CSV file with FileIO(filename) as csvfile: - reader = csv.DictReader(csvfile, delimiter=delimiter) + reader = csv.DictReader( + csvfile, + **file_get_csv_reader_options(csvfile, + {'delimiter': delimiter})) if reader.fieldnames: # Get the key and value column names if not key_column: @@ -567,7 +584,9 @@ def file_load_csv_dict( for row in reader: # Get the key for the row. key = None - if key_column in row: + if key_index: + key = len(csv_dict) + elif key_column in row: key = row.pop(key_column) # Get the value for the key value = None @@ -597,7 +616,8 @@ def file_load_csv_dict( def file_write_csv_dict(py_dict: dict, filename: str, - columns: list = None) -> list: + columns: list = None, + key_column_name: str = 'key') -> list: """Returns the filename after writing py_dict with a csv row per item. Each dictionary items is written as a row in the CSV file. @@ -629,6 +649,9 @@ def file_write_csv_dict(py_dict: dict, is used as the key's column name. If no columns are specified for values, column names are picked from each entry's value if the value is a dict. Else the value is written as column name 'value'. + key_column_name: name of the column used as key. + if '', the first column is used as key. + if set to None, the key is ignored. Returns: list of columns written to the output csv @@ -638,8 +661,10 @@ def file_write_csv_dict(py_dict: dict, # Get the list of columns value_column_name = '' if not columns: + columns = [] # Add a columns for key. - columns = ['key'] + if key_column_name: + columns.append(key_column_name) if len(columns) <= 1: # Get columns across all entries. for key, value in py_dict.items(): @@ -652,7 +677,8 @@ def file_write_csv_dict(py_dict: dict, value_column_name = 'value' columns.append(value_column_name) # Use the first column for the key. - key_column_name = columns[0] + if key_column_name is '': + key_column_name = columns[0] # Get the output filename output_files = file_get_matching(filename) @@ -799,7 +825,9 @@ def file_is_google_spreadsheet(filename: str) -> bool: return False -def file_open_google_spreadsheet(url: str) -> gspread.spreadsheet.Spreadsheet: +def file_open_google_spreadsheet(url: str, + retries: int = 3 + ) -> gspread.spreadsheet.Spreadsheet: """Returns the google spreasheet handle. Assumes caller has access to the spreadsheet. 
@@ -811,7 +839,14 @@ def file_open_google_spreadsheet(url: str) -> gspread.spreadsheet.Spreadsheet: google spreadsheet object for the given url """ # Get a handle for the whole spreadsheet - gs = _file_get_gspread_client().open_by_url(url) + gs = retry_call( + _file_get_gspread_client().open_by_url, + f_args=[url], + exceptions=gspread.exceptions.APIError, + tries=retries, + ) + if gs is None: + logging.error(f'Failed to open {url}') return gs @@ -943,7 +978,10 @@ def file_copy_to_spreadsheet(filename: str, # Read the rows from the source file rows = [] with FileIO(filename) as file: - csv_reader = csv.reader(file, skipinitialspace=True, escapechar='\\') + csv_reader = csv.reader(file, + skipinitialspace=True, + escapechar='\\', + **file_get_csv_reader_options(file)) for row in csv_reader: rows.append(row) @@ -963,6 +1001,120 @@ def file_copy_to_spreadsheet(filename: str, return ws.url +def file_get_sample_bytes(file: str, byte_count: int = 4096) -> bytes: + """Returns sample bytes from file. + + Args: + file: a file name or an open file handle. + byte_count: buyes to be returned. + + Returns: + bytes of the given byte_count. + The file handle is reset to the start. + """ + if isinstance(file, io.TextIOWrapper): + # File is a handle. Get the filename + file = file.name + if isinstance(file, str): + logging.debug(f'Getting sample {byte_count} bytes from {file}') + with FileIO(file, 'rb') as fh: + return fh.read(byte_count) + else: + return b'' + + +def file_get_encoding(file: str, + rawdata: bytes = None, + default: str = 'utf-8-sig') -> str: + """Returns the encoding for the file + + Args: + file: filename whose encoding is required. + rawdata: content whose encoding is to be detected if available. + default: default encoding to be retruned if it can't be determined. + + Returns: + string with encoding such as 'utf8' + """ + if rawdata is None: + rawdata = file_get_sample_bytes(file) + encoding_result = chardet.detect(rawdata) + if encoding_result: + encoding = encoding_result.get('encoding') + if encoding: + return encoding + return default + + +def file_get_csv_reader_options( + file: str, + default_options: dict = {}, + data: str = None, + encoding: str = None, + delim_chars: list = [',', ' ', ';', '|', ':']) -> dict: + """Returns a dictionary with options for the CSV file reader. + + Args: + file: name of the csv file to get encoding + default_options: default options returned if not detected + such as 'delimiter'. + data: string for which delimiter is to be detected + If data is not given, sample data is read from the file. + encoding: character encoding in the file. + delim_chars: list of possible delimiter characters. + If not set, non-alphanumeric characters from the first line + are used as candidate delimiter characters. + + Returns: + dict with the following: + 'delimiter': delimiter character for CSV files. + 'dialect': File dialect, such as 'unix', 'excel' + """ + result = dict(default_options) + + if data is None: + # Get data from file decoded with the right encoding + rawdata = file_get_sample_bytes(file) + if encoding is None: + encoding = file_get_encoding(file, rawdata=rawdata) + data = rawdata.decode(encoding) + + # Get the dialect for the data + try: + dialect = csv.Sniffer().sniff(data) + except csv.Error: + # Use default as excel as it may not be detected well. + dialect = 'excel' + if dialect: + result['dialect'] = dialect + + # Get CSV delimiter by counting possible delimiter characters + # across rows and picking the most common delimiter. 
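# --- Editor's illustrative sketch (not part of the original patch) ---
# file_get_encoding() and file_get_csv_reader_options() above sniff the
# character encoding with chardet and the CSV dialect with csv.Sniffer before
# building reader options. This standalone sketch shows the same two calls on
# in-memory bytes; the function name and sample data are hypothetical.
import csv

import chardet


def sniff_csv_options(rawdata: bytes, default_encoding: str = 'utf-8-sig') -> dict:
    """Returns {'encoding', 'delimiter'} guessed from a sample of CSV bytes."""
    detected = chardet.detect(rawdata)
    encoding = (detected or {}).get('encoding') or default_encoding
    text = rawdata.decode(encoding)
    try:
        delimiter = csv.Sniffer().sniff(text).delimiter
    except csv.Error:
        delimiter = ','  # fall back to the most common delimiter
    return {'encoding': encoding, 'delimiter': delimiter}


# sniff_csv_options(b'key;property;value\ntotal;populationType;dcs:Person\n')
# e.g. -> {'encoding': 'ascii', 'delimiter': ';'} (chardet's guess may vary)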
+ rows = data.split('\n') + if not delim_chars: + # Get non alphanumeric characters from data. + delim_chars = {c for c in rows[0].strip() if not c.isalnum()} + logging.debug(f'Looking for delimiter in %s among %s', file, + delim_chars) + char_counts = {c: [] for c in delim_chars} + for index in range(1, len(rows) - 1): + # Count possible delimiter characters per row + row = rows[index] + for char in delim_chars: + char_counts[char].append(row.count(char)) + # Get the char with the same count across rows. + for c in char_counts.keys(): + c_counts = char_counts[c] + if c_counts: + c_min = min(c_counts) + c_med = np.median(c_counts) + if c_min > 0 and c_min == c_med: + result['delimiter'] = c + break + logging.debug('Got options for file: %s: result = %s', file, result) + return result + + def file_is_csv(filename: str) -> bool: """Returns True is the file has a .csv extension or is a spreadsheet.""" if filename.endswith('.csv') or file_is_google_spreadsheet(filename): diff --git a/util/statvar_dcid_generator.py b/util/statvar_dcid_generator.py index 52a59d05fe..78c16a830d 100644 --- a/util/statvar_dcid_generator.py +++ b/util/statvar_dcid_generator.py @@ -14,20 +14,20 @@ """A utility to generate dcid for statistical variables.""" import copy -import re import os +import re import sys -#pylint: disable=wrong-import-position -#pylint: disable=import-error +# pylint: disable=wrong-import-position +# pylint: disable=import-error # Allows the following module imports to work when running as a script _SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(_SCRIPT_PATH, '.')) # For soc_codes_names from soc_codes_names import SOC_MAP -#pylint: enable=wrong-import-position -#pylint: enable=import-error +# pylint: enable=wrong-import-position +# pylint: enable=import-error # Global constants # Regex to match the quantity notations - [value quantity], [quantity value] @@ -48,10 +48,20 @@ r'(?P-|-?\d+(\.\d+)?)\]') # These are the default properties ignored during dcid generation -_DEFAULT_IGNORE_PROPS = ('unit', 'Node', 'memberOf', 'typeOf', - 'constraintProperties', 'name', 'description', - 'descriptionUrl', 'label', 'url', 'alternateName', - 'scalingFactor') +_DEFAULT_IGNORE_PROPS = ( + 'unit', + 'Node', + 'memberOf', + 'typeOf', + 'constraintProperties', + 'name', + 'description', + 'descriptionUrl', + 'label', + 'url', + 'alternateName', + 'scalingFactor', +) # Regex to match prefixes to be removed from constraints. The regex checks for # specific prefixes followed by an upper case letter or underscore. This helps @@ -109,7 +119,7 @@ '1026': 'LeisureHospitality', '1027': 'OtherServices', '1028': 'PublicAdministration', - '1029': 'Unclassified' + '1029': 'Unclassified', } # Regex to match NAICS Codes. 
These codes could be a single code or a range @@ -201,12 +211,12 @@ 'householderRelatedChildrenUnder18Years': { 'prepend': 'Householder', 'replace': 'Child', - 'replacement': 'RelatedChildren' + 'replacement': 'RelatedChildren', }, 'householderOwnChildrenUnder18Years': { 'prepend': 'Householder', 'replace': 'Child', - 'replacement': 'OwnChildren' + 'replacement': 'OwnChildren', }, 'occupation': { 'append': 'Occupation' @@ -220,7 +230,7 @@ 'dateOfEntry': { 'prepend': 'DateOfEntry', 'replace': 'Date', - 'replacement': '' + 'replacement': '', }, 'placeOfBirth': { 'prepend': 'PlaceOfBirth' @@ -228,7 +238,7 @@ 'dateMovedIn': { 'prepend': 'MovedInDate', 'replace': 'Date', - 'replacement': '' + 'replacement': '', }, 'bachelorsDegreeMajor': { 'prepend': 'BachelorOf' @@ -265,17 +275,36 @@ }, 'mothersEducation': { 'prepend': 'Mother' - } + }, + 'importSource': { + 'prepend': 'ImportFrom', + }, + 'exportDestination': { + 'prepend': 'ExportTo', + }, + 'lendingEntity': { + 'prepend': 'Lender', + }, } # This is a list of boolean properties _BOOLEAN_PROPS = [ - 'hasComputer', 'hasFunctionalToilet', 'isAccessibleForFree', - 'isEnergyStored', 'isFDAReferenceStandard', 'isFamilyFriendly', - 'isGenomeRepresentationFull', 'isGift', 'isInternetUser', - 'isLiquefiedNaturalGasStored', 'isLiveBroadcast', 'isNaturalGasStored', - 'isPharmacodynamicRelationship', 'isPharmacokineticRelationship', - 'isRefSeqGenBankAssembliesIdentical', 'isHateCrime' + 'hasComputer', + 'hasFunctionalToilet', + 'isAccessibleForFree', + 'isEnergyStored', + 'isFDAReferenceStandard', + 'isFamilyFriendly', + 'isGenomeRepresentationFull', + 'isGift', + 'isInternetUser', + 'isLiquefiedNaturalGasStored', + 'isLiveBroadcast', + 'isNaturalGasStored', + 'isPharmacodynamicRelationship', + 'isPharmacokineticRelationship', + 'isRefSeqGenBankAssembliesIdentical', + 'isHateCrime', ] # To map stat vars which do not follow the conventions of stat var dcid naming @@ -283,29 +312,30 @@ # the replacement dcid. _LEGACY_MAP = { 'Count_Person_WithDisability_NoHealthInsurance': - 'Count_Person_NoHealthInsurance_WithDisability', + ('Count_Person_NoHealthInsurance_WithDisability'), 'Count_Person_NoDisability_NoHealthInsurance': - 'Count_Person_NoHealthInsurance_NoDisability' + ('Count_Person_NoHealthInsurance_NoDisability'), } def _capitalize_process(word: str) -> str: """Capitalizes, removes namespaces, measurement constraint prefixes and - underscores from a word. - Manual upper casing is preferred compared to the builtin function - str.capitalize() because we want to change only the case of the first - character and ignore the case of other characters. Firstly, all namespaces - are removed from the string. Then, constraint prefixes and underscores - are removed. Lastly, the first character is upper cased. + underscores from a word. + + Manual upper casing is preferred compared to the builtin function + str.capitalize() because we want to change only the case of the first + character and ignore the case of other characters. Firstly, all namespaces + are removed from the string. Then, constraint prefixes and underscores + are removed. Lastly, the first character is upper cased. - Args: - word: A string literal to capitalize and process. + Args: + word: A string literal to capitalize and process. - Returns: - Returns a string that can be used in dcid generation. - Returns None if the string is empty. - """ + Returns: + Returns a string that can be used in dcid generation. + Returns None if the string is empty. 
+ """ if word: # Removing namespaces word = word[word.find(':') + 1:] @@ -319,6 +349,15 @@ def _capitalize_process(word: str) -> str: # Removing all underscores word = word.replace('_', '') + # Remove '/' or replace with '-' when used as number separator + words = [] + for tok in word.split('/'): + if tok: + if tok[0].isdigit() and len( + words) > 0 and words[-1][-1].isdigit(): + words.append('-') + words.append(tok[0].upper() + tok[1:]), + word = ''.join(words) # Upper casing the first character word = word[0].upper() + word[1:] @@ -329,19 +368,15 @@ def _capitalize_process(word: str) -> str: def _generate_quantity_range_name(match_dict: dict) -> str: """Generate a name for a quantity range. - Args: - match_dict: A dictionary containing quantity range regex groups. - Expected syntax of match_dict is - { - 'lower_limit': , - 'upper_limit': , - 'quantity': - } - - Returns: - A string representing the quantity range name to be used in the dcid. - Returns None if any of the expected keys are not in the dictionary. - """ + Args: + match_dict: A dictionary containing quantity range regex groups. Expected + syntax of match_dict is { 'lower_limit': , 'upper_limit': + , 'quantity': } + + Returns: + A string representing the quantity range name to be used in the dcid. + Returns None if any of the expected keys are not in the dictionary. + """ try: lower_limit = match_dict['lower_limit'] upper_limit = match_dict['upper_limit'] @@ -369,19 +404,20 @@ def _generate_quantity_range_name(match_dict: dict) -> str: def _naics_code_to_name(naics_val: str) -> str: """Converts NAICS codes to their industry using the _NAICS_MAP. - Args: - naics_val: A NAICS string literal to process. - Expected syntax of naics_val - NAICS/{codes} - '-' can be used to denote range of codes that may or may not belong - to the same industry. For eg, 44-45 will be mapped to 'RetailTrade'. - '_' can be used to represent multiple industries. For eg, 51_52 will - be mapped to 'InformationFinanceInsurance'. A combination of '-' and - '_' is acceptable. - Returns: - A string with all NAICS codes changed to their respective industry. - This string can be used in dcid generation. Returns None if the string - is empty or if the string does not follow the expected syntax. - """ + + Args: + naics_val: A NAICS string literal to process. Expected syntax of naics_val + - NAICS/{codes} '-' can be used to denote range of codes that may or may + not belong to the same industry. For eg, 44-45 will be mapped to + 'RetailTrade'. '_' can be used to represent multiple industries. For eg, + 51_52 will be mapped to 'InformationFinanceInsurance'. A combination of + '-' and '_' is acceptable. + + Returns: + A string with all NAICS codes changed to their respective industry. + This string can be used in dcid generation. Returns None if the string + is empty or if the string does not follow the expected syntax. 
+ """ # Helper function to process NAICS ranges def _process_naics_range(range_str: str) -> str: @@ -419,7 +455,9 @@ def _process_naics_range(range_str: str) -> str: if match_str.find('-') != -1: # Range industry_str = _process_naics_range(match_str) else: - industry_str = _NAICS_MAP[match_str] + industry_str = _NAICS_MAP.get(match_str) + if not industry_str: + return None processed_str = processed_str + industry_str return processed_str return None @@ -427,16 +465,18 @@ def _process_naics_range(range_str: str) -> str: def _soc_code_to_name(soc_val: str) -> str: """Converts SOCv2018 codes to their industry using the SOC_MAP from - soc_codes_names.py - - Args: - soc_val: A SOCv2018 string literal to process. - Expected syntax of soc_val - SOCv2018/{code} - Returns: - A string with SOC code changed to it's occupation. - This string can be used in dcid generation. Returns the original string - if the code is not in the SOC_MAP. Returns None if the string is empty. - """ + + soc_codes_names.py + + Args: + soc_val: A SOCv2018 string literal to process. Expected syntax of soc_val + - SOCv2018/{code} + + Returns: + A string with SOC code changed to it's occupation. + This string can be used in dcid generation. Returns the original string + if the code is not in the SOC_MAP. Returns None if the string is empty. + """ if soc_val: processed_str = soc_val @@ -458,20 +498,22 @@ def _prepend_append_replace(word, replace='', replacement=''): """Prepends, appends and replaces text in a word. - Args: - word: A string literal to prepend, append or replace on. - prepend: A string literal to prepend to word. - append: A string literal to append to word. - replace: A string literal that repersents a substring in word to be - replaced. - replacement: A string literal. In word, all occurances of replace will - be changed to replacement. - Returns: - A string after appending, prepending and replacing to word. - """ + + Args: + word: A string literal to prepend, append or replace on. + prepend: A string literal to prepend to word. + append: A string literal to append to word. + replace: A string literal that repersents a substring in word to be + replaced. + replacement: A string literal. In word, all occurances of replace will be + changed to replacement. + + Returns: + A string after appending, prepending and replacing to word. + """ if replace: word = word.replace(replace, replacement) - if prepend: + if prepend and not word.lower().startswith(prepend.lower()): word = prepend + word if append: word = word + append @@ -481,18 +523,14 @@ def _prepend_append_replace(word, def _generate_quantity_name(match_dict: dict) -> str: """Generate a name for a quantity. - Args: - match_dict: A dictionary containing quantity regex groups. - Expected syntax of match_dict - { - 'value': , - 'quantity': - } - - Returns: - A string representing the quantity name to be used in the dcid. - Returns None if any of the expected keys are not in the dictionary. - """ + Args: + match_dict: A dictionary containing quantity regex groups. Expected syntax + of match_dict { 'value': , 'quantity': } + + Returns: + A string representing the quantity name to be used in the dcid. + Returns None if any of the expected keys are not in the dictionary. + """ try: value = match_dict['value'] quantity = match_dict['quantity'] @@ -505,37 +543,41 @@ def _generate_quantity_name(match_dict: dict) -> str: def _generate_boolean_value_name(prop: str, value: str) -> str: """Generates a name given a boolean property and value. 
- Args: - prop: A string literal representing the boolean property name. - value: A string literal representing the boolean property value. - Returns: - A string that can be used in dcid generation - """ + + Args: + prop: A string literal representing the boolean property name. + value: A string literal representing the boolean property value. + + Returns: + A string that can be used in dcid generation + """ if value in ('True', 'False'): - constraint_value = value == "True" + constraint_value = value == 'True' pop = None prefix = None - if prop.startswith("has"): + if prop.startswith('has'): pop = prop[3:] - prefix = "Has" if constraint_value else "No" - elif prop.startswith("is"): + prefix = 'Has' if constraint_value else 'No' + elif prop.startswith('is'): pop = prop[2:] - prefix = "Is" if constraint_value else "Not" + prefix = 'Is' if constraint_value else 'Not' else: - assert False, f"Unhandled prefix {prop}" + assert False, f'Unhandled prefix {prop}' return prefix + pop return None def _process_constraint_property(prop: str, value: str) -> str: """Processes constraint property, value and returns a name that can be used - in dcid generation. - Args: - prop: A string literal representing the constraint property name. - value: A string literal representing the constraint property value. - Returns: - A string that can be used in dcid generation. - """ + + in dcid generation. + Args: + prop: A string literal representing the constraint property name. + value: A string literal representing the constraint property value. + + Returns: + A string that can be used in dcid generation. + """ if 'NAICS' in value: name = _naics_code_to_name(value) elif 'SOCv2018/' in value: @@ -568,68 +610,66 @@ def _process_constraint_property(prop: str, value: str) -> str: def get_statvar_dcid(stat_var_dict: dict, ignore_props: list = None) -> str: """Generates the dcid given a statistical variable. - The generated dcid will follow the pattern - ____ - - 1. measurementQualifier is added as a prefix to the dcid. - 2. statType is included when it is not measuredValue. - 3. measurementDenominator is added as a suffix to the dcid. - 4. Constraints are sorted alphabetically based on the prop and values are - added to the dcid. - 5. Existing dcids may not follow the above conventions. The _LEGACY_MAP maps - generated dcids to their existing dcid. - 6. NAICS and SOC codes are replaced with their industry and occupation names - respectively. See _NAICS_MAP and util/soc_codes_names.py for the - mapping. - 7. Boolean constraints are replaced by their populations. For example, - p=isInternetUser and v=True/False becomes v=isInternetUser/ - notInternetUser. See _BOOLEAN_PROPS for the properties that are - considered for this renaming. - 8. Quantities and Quantity Ranges are changed into a name to be used in the - dcid. For example p=age and v=[10 20 Years] becomes v=10To20Years. - 9. Certain variables have text prepended or appended to their constraints to - improve readability. See _PREPEND_APPEND_REPLACE_MAP for more details. - - Args: - stat_var_dict: A dictionary with property: value of the statistical - variable as key-value pairs. - ignore_props: A list of properties to ignore from stat_var_dict when - generating the dcid. This list of ignore_props will be added to the - default set of properties that are ignored. The ignore_props can be - used to account for dependent properties to ignore when generating - the dcid. 
For example in the following statVar, - { - populationType: Person - measuredProperty: count - statType: measuredValue - healthInsurance: NoHealthInsurance - armedForceStatus: Civilian - institutionalization: USC_NonInstitutionalized - } - since the healthInsurance property indicates they are Civilian and - USC_NonInstitutionalized, ignore_props can be the list - ['armedForceStatus', 'institutionalization']. During the dcid - generation process, these properties will not be considered. - - Returns: - A string representing the dcid of the statistical variable. - - Caveats: - 1. Currently, there is no support for renaming ICD10 cause of death - values and DEA drug names. - 2. MeasuredProp=InsuredUnemploymentRate is not changed to - Rate_InsuredUnemployment. - 3. The generated dcids can get too long due to the large number of - constraint props. In such cases, manual generation or the - ignore_props arg can be used to exclude a few props from the - generation process. It is recommended to limit the length of - statvar dcids to 80 characters or less. - 4. This function does not differentiate between property names and only - uses the values to generate the dcid. Two props having the same - value, say p1=fuel, v1=Coal and p2=energy, v2=Coal will result in - the same dcid. The _PREPEND_APPEND_REPLACE_MAP can be modified to - disambiguate in this case. - """ + The generated dcid will follow the pattern + ____ + + 1. measurementQualifier is added as a prefix to the dcid. + 2. statType is included when it is not measuredValue. + 3. measurementDenominator is added as a suffix to the dcid. + 4. Constraints are sorted alphabetically based on the prop and values are + added to the dcid. + 5. Existing dcids may not follow the above conventions. The _LEGACY_MAP maps + generated dcids to their existing dcid. + 6. NAICS and SOC codes are replaced with their industry and occupation names + respectively. See _NAICS_MAP and util/soc_codes_names.py for the + mapping. + 7. Boolean constraints are replaced by their populations. For example, + p=isInternetUser and v=True/False becomes v=isInternetUser/ + notInternetUser. See _BOOLEAN_PROPS for the properties that are + considered for this renaming. + 8. Quantities and Quantity Ranges are changed into a name to be used in the + dcid. For example p=age and v=[10 20 Years] becomes v=10To20Years. + 9. Certain variables have text prepended or appended to their constraints to + improve readability. See _PREPEND_APPEND_REPLACE_MAP for more details. + + Args: + stat_var_dict: A dictionary with property: value of the statistical + variable as key-value pairs. + ignore_props: A list of properties to ignore from stat_var_dict when + generating the dcid. This list of ignore_props will be added to the + default set of properties that are ignored. The ignore_props can be used + to account for dependent properties to ignore when generating the dcid. + For example in the following statVar, { + populationType: Person + measuredProperty: count + statType: measuredValue + healthInsurance: NoHealthInsurance + armedForceStatus: Civilian + institutionalization: USC_NonInstitutionalized } since the + healthInsurance property indicates they are Civilian and + USC_NonInstitutionalized, ignore_props can be the list + ['armedForceStatus', 'institutionalization']. During the dcid + generation process, these properties will not be considered. + + Returns: + A string representing the dcid of the statistical variable. + + Caveats: + 1. 
Currently, there is no support for renaming ICD10 cause of death + values and DEA drug names. + 2. MeasuredProp=InsuredUnemploymentRate is not changed to + Rate_InsuredUnemployment. + 3. The generated dcids can get too long due to the large number of + constraint props. In such cases, manual generation or the + ignore_props arg can be used to exclude a few props from the + generation process. It is recommended to limit the length of + statvar dcids to 80 characters or less. + 4. This function does not differentiate between property names and only + uses the values to generate the dcid. Two props having the same + value, say p1=fuel, v1=Coal and p2=energy, v2=Coal will result in + the same dcid. The _PREPEND_APPEND_REPLACE_MAP can be modified to + disambiguate in this case. + """ # TODO: Renaming cause of death properties # TODO: Renaming DEA drug names @@ -693,7 +733,6 @@ def add_prop_to_list(prop: str, svd: dict, dcid_list: list): if denominator_suffix: dcid_list.append(denominator_suffix) - dcid = '_'.join(dcid_list) dcid = _LEGACY_MAP.get(dcid, dcid) return dcid
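To make the dcid conventions documented above concrete, here is a small sketch of expected behavior (the import path and property values are illustrative assumptions; the expected dcids follow the documented conventions and are not verified output):

```python
from util.statvar_dcid_generator import get_statvar_dcid  # assumes repo root on sys.path

# Quantity range and boolean constraints, per conventions 7 and 8 above.
sv = {
    'populationType': 'Person',
    'measuredProperty': 'count',
    'statType': 'measuredValue',
    'age': '[10 20 Years]',
    'isInternetUser': 'True',
}
print(get_statvar_dcid(sv))
# Expected, per the conventions above: 'Count_Person_10To20Years_IsInternetUser'

# The new _PREPEND_APPEND_REPLACE_MAP entries add 'ImportFrom' / 'ExportTo' /
# 'Lender' prefixes for importSource / exportDestination / lendingEntity,
# and the prefix is skipped if the value already starts with it.
sv_trade = {
    'populationType': 'EconomicActivity',
    'measuredProperty': 'amount',
    'statType': 'measuredValue',
    'exportDestination': 'country/IND',
}
print(get_statvar_dcid(sv_trade))
# Expected to contain 'ExportTo' followed by the processed value, e.g.
# 'Amount_EconomicActivity_ExportToCountryIND' (illustrative, not verified).
```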