Cahaba Codebase Cleanup - Small Items
General repository cleanup: memory profiling is now an optional flag, and the API's release feature now saves its outputs.

- Remove Dockerfile.prod, rename Dockerfile.dev to just Dockerfile, and remove `.dockerignore`.
- Clean up Dockerfile and remove any unused* packages or variables.
- Remove any unused* Python packages from the Pipfile.
- Move the CHANGELOG.md, SECURITY.md, and TERMS.md files to the /docs folder.
- Remove any unused* scripts in the /tools and /src folders.
- Move tools/preprocess scripts into tools/.
- Ensure all scripts in the /src folder have their code in functions and are called via a `__main__` block (this will help with fully implementing memory profiling; see the sketch below).
- Changed memory-profiling to be an optional flag (-m) for fim_run.sh.
- Updated FIM API to save all outputs during a "release" job.

This resolves #432, resolves #426, and resolves #434.
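
Below is a rough, hedged sketch of the layout the bullets above describe: each src/ script keeps its logic in functions and only parses arguments under the `__main__` guard. The function and argument names here are invented for illustration and are not taken from the repository.

```python
import argparse


def process_huc(huc, output_dir):
    """Placeholder for a script's real work, kept out of module scope."""
    print(f"processing {huc} into {output_dir}")


if __name__ == '__main__':
    # Argument parsing stays under the __main__ guard so the function above
    # can be imported (and wrapped by a profiler) without side effects.
    parser = argparse.ArgumentParser(description='example src/ script layout')
    parser.add_argument('-u', '--huc', required=True, help='HUC to process')
    parser.add_argument('-o', '--output-dir', required=True, help='output directory')
    args = vars(parser.parse_args())

    process_huc(**args)
```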
nickchadwick-noaa authored Aug 18, 2021
1 parent 7aa4ab9 commit 8949dc3
Showing 59 changed files with 598 additions and 1,371 deletions.
1 change: 0 additions & 1 deletion .dockerignore

This file was deleted.

7 changes: 3 additions & 4 deletions Dockerfile.dev → Dockerfile
@@ -6,17 +6,16 @@ ARG projectDir=/foss_fim
ARG depDir=/dependencies
ARG taudemVersion=98137bb6541a0d0077a9c95becfed4e56d0aa0ac
ARG taudemVersion2=81f7a07cdd3721617a30ee4e087804fddbcffa88
ENV DEBIAN_FRONTEND noninteractive
ENV taudemDir=$depDir/taudem/bin
ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin

RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/dtarb/taudem.git
RUN git clone https://github.com/fernandoa123/cybergis-toolkit.git taudem_accelerated_flowDirections

RUN apt-get update --fix-missing && apt-get install -y cmake mpich \
libgtest-dev libboost-test-dev libnetcdf-dev && rm -rf /var/lib/apt/lists/*
libgtest-dev libboost-test-dev libnetcdf-dev && rm -rf /var/lib/apt/lists/*

## Compile Main taudem repo ##
RUN mkdir -p taudem/bin
@@ -53,7 +52,7 @@ ARG dataDir=/data
ARG projectDir=/foss_fim
ARG depDir=/dependencies
ENV inputDataDir=$dataDir/inputs
ENV outputDataDir=$dataDir/outputs
ENV outputDataDir=$dataDir/outputs
ENV srcDir=$projectDir/src
ENV taudemDir=$depDir/taudem/bin
ENV taudemDir2=$depDir/taudem_accelerated_flowDirections/taudem/build/bin
100 changes: 0 additions & 100 deletions Dockerfile.prod

This file was deleted.

182 changes: 93 additions & 89 deletions Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
@@ -132,7 +132,7 @@ Please see the issue tracker on GitHub for known issues and for getting help.
NOAA's National Water Center welcomes anyone to contribute to the Cahaba repository to improve flood inundation mapping capabilities. Please contact Brad Bates ([email protected]) or Fernando Salas ([email protected]) to get started.

## Open Source Licensing Info
1. [TERMS](TERMS.md)
1. [TERMS](docs/TERMS.md)
2. [LICENSE](LICENSE)

## Credits and References
32 changes: 18 additions & 14 deletions api/node/updater/updater.py
@@ -75,7 +75,7 @@ def update_loop():
outputs_path = f"/data/outputs/{current_jobs[job_name]['nice_name']}"
if os.path.isdir(outputs_path):
shutil.rmtree(outputs_path)

jobs_to_delete.append(job_name)

active_statuses = [
@@ -113,7 +113,7 @@ def update_loop():
total_active_cores += current_jobs[j]['parallel_jobs'] * 5
else:
total_active_cores += current_jobs[j]['parallel_jobs']

# Machine has enough resources to run a new job
potential_active_cores = 0
if current_jobs[job_name]['hucs_type'] == '6':
@@ -170,16 +170,21 @@ def update_loop():
current_jobs[job_name]['total_output_files_length'] = len(current_jobs[job_name]['output_files_saved'].keys())
current_jobs[job_name]['status'] = 'Ready to Save File'
elif current_jobs[job_name]['job_type'] == 'release':
# Move outputs to previous_fim and set them to be copied to the dev machine
if os.path.isdir(f"/data/previous_fim/{current_jobs[job_name]['nice_name']}"):
shutil.rmtree(f"/data/previous_fim/{current_jobs[job_name]['nice_name']}")
if os.path.isdir(f"/data/outputs/{current_jobs[job_name]['nice_name']}"): shutil.move(f"/data/outputs/{current_jobs[job_name]['nice_name']}", '/data/previous_fim')
for path, folders, files in os.walk(f"/data/previous_fim/{current_jobs[job_name]['nice_name']}"):
for file in files:
current_jobs[job_name]['output_files_saved'][os.path.join(path, file)] = 0
current_jobs[job_name]['total_output_files_length'] = len(current_jobs[job_name]['output_files_saved'].keys())
current_jobs[job_name]['status'] = 'Ready for Synthesize Test Cases'

if current_jobs[job_name]['status'] == 'Ready for Synthesize Test Cases':
job_name = current_jobs[job_name]['job_name']
nice_name = current_jobs[job_name]['nice_name']
parallel_jobs = current_jobs[job_name]['parallel_jobs']

if os.path.isdir(f"/data/previous_fim/{nice_name}"):
shutil.rmtree(f"/data/previous_fim/{nice_name}")
if os.path.isdir(f"/data/outputs/{nice_name}"): shutil.move(f"/data/outputs/{nice_name}", '/data/previous_fim')
# Kick off the new job as a docker container to run eval metrics
print(f"docker run -d --name {job_name} -v {DATA_PATH}:/data/ -v {DATA_PATH}temp/{job_name}/:/foss_fim -w /foss_fim/tools {DOCKER_IMAGE_PATH} /foss_fim/tools/synthesize_test_cases.py -c PREV --fim-version {nice_name} --job-number {parallel_jobs} -m /data/test_cases/metrics_library/all_official_versions.csv")
subprocess.call(f"docker run -d --name {job_name} -v {DATA_PATH}:/data/ -v {DATA_PATH}temp/{job_name}/:/foss_fim -w /foss_fim/tools {DOCKER_IMAGE_PATH} /foss_fim/tools/synthesize_test_cases.py -c PREV --fim-version {nice_name} --job-number {parallel_jobs} -m /data/test_cases/metrics_library/all_official_versions.csv", shell=True)
@@ -205,7 +210,6 @@ def update_loop():
subprocess.call(f"docker container rm {job_name}", shell=True)

current_jobs[job_name]['output_files_saved']['/data/test_cases/metrics_library/all_official_versions.csv'] = 0
current_jobs[job_name]['output_files_saved'][f"/data/previous_fim/{current_jobs[job_name]['nice_name']}/logs/docker.log"] = 0
current_jobs[job_name]['output_files_saved'][f"/data/previous_fim/{current_jobs[job_name]['nice_name']}/logs/synthesize_test_cases_docker.log"] = 0
current_jobs[job_name]['total_output_files_length'] = len(current_jobs[job_name]['output_files_saved'].keys())
current_jobs[job_name]['status'] = 'Ready for Eval Plots'
@@ -289,7 +293,7 @@ def update_loop():
current_jobs[job_name]['output_files_saved'][f"/data/previous_fim/{current_jobs[job_name]['nice_name']}/logs/generate_categorical_fim.log"] = 0
current_jobs[job_name]['total_output_files_length'] = len(current_jobs[job_name]['output_files_saved'].keys())
current_jobs[job_name]['status'] = 'Ready to Save File'

# Trigger connector to transmit the outputs to the output_handler
# If the output_handler is offline, it will keep retrying until the output_handler is online
if current_jobs[job_name]['status'] == 'Ready to Save File' and (shared_data['current_saving_job'] == '' or shared_data['current_saving_job'] == current_jobs[job_name]):
@@ -303,15 +307,15 @@ def update_loop():
output_to_save = {'path': path, 'chunk_index': current_jobs[job_name]['output_files_saved'][path]}

if output_to_save != {}:
if shared_data['connected']:
if shared_data['connected']:
sio.emit('ready_for_output_handler', {
'nice_name': current_jobs[job_name]['nice_name'],
'job_name': job_name,
'path': output_to_save['path'],
'chunk_index': output_to_save['chunk_index']
})
current_jobs[job_name]['status'] = 'Saving File'

# Once the output_handler is done getting the outputs and the connector deletes the temp repo source,
# mark as completed
if current_jobs[job_name]['status'] == 'Saving File':
@@ -335,8 +339,8 @@ def update_loop():
if os.path.isdir(destination):
shutil.rmtree(destination)
if os.path.isdir(f"{outputs_path}/aggregate_fim_outputs"): shutil.move(f"{outputs_path}/aggregate_fim_outputs", destination)
if os.path.isdir(f"{outputs_path}/logs"): shutil.move(f"{outputs_path}/logs", f"{destination}/logs")
if os.path.isdir(f"/data/catfim/{current_jobs[job_name]['nice_name']}"): shutil.move(f"/data/catfim/{current_jobs[job_name]['nice_name']}", f"{destination}/catfim")
if os.path.isdir(f"{outputs_path}/logs"): shutil.move(f"{outputs_path}/logs", f"{destination}/logs")
if os.path.isdir(f"/data/catfim/{current_jobs[job_name]['nice_name']}"): shutil.move(f"/data/catfim/{current_jobs[job_name]['nice_name']}", f"{destination}/catfim")

if os.path.isdir(outputs_path):
shutil.rmtree(outputs_path)
@@ -349,7 +353,7 @@ def update_loop():
pass

current_jobs[job_name]['status'] = 'Completed' if current_jobs[job_name]['exit_code'] == 0 else 'Error'

shared_data['current_saving_job'] = ''
current_jobs[job_name]['is_actively_saving'] = False
print(f"{job_name} completed")
@@ -383,7 +387,7 @@ def update_loop():
if shared_data['connected']: sio.emit('update', {'jobUpdates': job_updates, 'presetsList': presets_list})
with open('/data/outputs/current_jobs.json.temp', 'w') as f:
json.dump(current_jobs, f)
shutil.move('/data/outputs/current_jobs.json.temp', '/data/outputs/current_jobs.json')
shutil.move('/data/outputs/current_jobs.json.temp', '/data/outputs/current_jobs.json')

sio = socketio.Client()

@@ -523,4 +527,4 @@ def ws_file_saved(data):
current_jobs[job_name]['status'] = 'Ready to Save File'

sio.connect('http://fim_node_connector:6000/')
update_loop()
update_loop()
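
In the "release" branch of update_loop() shown above, the updater moves the finished run into /data/previous_fim and then registers every file under that tree so the save stage will transmit it to the output handler. A condensed sketch of that registration step follows; the helper function name is invented, while the paths and the chunk-index-of-zero convention come from the diff above.

```python
import os


def register_release_outputs(job, nice_name):
    """Queue every file of an archived release run for the output handler."""
    release_dir = f"/data/previous_fim/{nice_name}"
    for path, _folders, files in os.walk(release_dir):
        for file_name in files:
            # A value of 0 means no chunks of this file have been sent yet.
            job['output_files_saved'][os.path.join(path, file_name)] = 0
    job['total_output_files_length'] = len(job['output_files_saved'])
    job['status'] = 'Ready for Synthesize Test Cases'
```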
2 changes: 1 addition & 1 deletion config/params_template.env
@@ -7,7 +7,7 @@ export wbd_buffer=5000
export thalweg_lateral_elev_threshold=3

#### geospatial parameters ####
export max_split_distance_meters=1500
export max_split_distance_meters=2000
export ms_buffer_dist=7000
export lakes_buffer_dist_meters=20

17 changes: 17 additions & 0 deletions CHANGELOG.md → docs/CHANGELOG.md
@@ -2,6 +2,23 @@ All notable changes to this project will be documented in this file.
We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.
<br/><br/>

## v3.0.21.0 - 2021-08-18 - [PR #433](https://github.com/NOAA-OWP/cahaba/pull/433)

General repository cleanup: memory profiling is now an optional flag, and the API's release feature now saves its outputs.

## Changes
- Remove `Dockerfile.prod`, rename `Dockerfile.dev` to just `Dockerfile`, and remove `.dockerignore`.
- Clean up `Dockerfile` and remove any unused* packages or variables.
- Remove any unused* Python packages from the `Pipfile`.
- Move the `CHANGELOG.md`, `SECURITY.md`, and `TERMS.md` files to the `/docs` folder.
- Remove any unused* scripts in the `/tools` and `/src` folders.
- Move `tools/preprocess` scripts into `tools/`.
- Ensure all scripts in the `/src` folder have their code in functions and are being called via a `__main__` function (This will help with implementing memory profiling fully).
- Changed memory-profiling to be an optional flag `-m` for `fim_run.sh`.
- Updated FIM API to save all outputs during a "release" job.

<br/><br/>

## v3.0.20.2 - 2021-08-13 - [PR #443](https://github.com/NOAA-OWP/cahaba/pull/443)

This merge modifies `clip_vectors_to_wbd.py` to check for relevant input data.
SECURITY.md → docs/SECURITY.md: file renamed without changes.
TERMS.md → docs/TERMS.md: file renamed without changes.
5 changes: 5 additions & 0 deletions fim_run.sh
@@ -21,6 +21,7 @@ usage ()
echo ' -w/--whitelist : list of files to save in a production run in addition to final inundation outputs'
echo ' ex: file1.tif,file2.json,file3.csv'
echo ' -v/--viz : compute post-processing on outputs to be used in viz'
echo ' -m/--mem : enable memory profiling'
exit
}

@@ -69,6 +70,9 @@ in
-v|--viz)
viz=1
;;
-m|--mem)
mem=1
;;
*) ;;
esac
shift
@@ -107,6 +111,7 @@ export extent=$extent
export production=$production
export whitelist=$whitelist
export viz=$viz
export mem=$mem
logFile=$outputRunDataDir/logs/summary.log

## Define inputs
14 changes: 8 additions & 6 deletions src/acquire_and_preprocess_inputs.py
@@ -19,26 +19,27 @@
NHD_VECTOR_EXTRACTION_SUFFIX,
PREP_PROJECTION,
WBD_NATIONAL_URL,
FIM_ID,
OVERWRITE_WBD,
OVERWRITE_NHD,
OVERWRITE_ALL)
FIM_ID
)

from utils.shared_functions import (pull_file, run_system_command,
subset_wbd_gpkg, delete_file,
from utils.shared_functions import (pull_file,
run_system_command,
delete_file,
getDriver)

NHDPLUS_VECTORS_DIRNAME = 'nhdplus_vectors'
NHDPLUS_RASTERS_DIRNAME = 'nhdplus_rasters'
NWM_HYDROFABRIC_DIRNAME = 'nwm_hydrofabric'
NWM_FILE_TO_SUBSET_WITH = 'nwm_flows.gpkg'


def subset_wbd_to_nwm_domain(wbd,nwm_file_to_use):

intersecting_indices = [not (gpd.read_file(nwm_file_to_use,mask=b).empty) for b in wbd.geometry]

return(wbd[intersecting_indices])


def pull_and_prepare_wbd(path_to_saved_data_parent_dir,nwm_dir_name,nwm_file_to_use,overwrite_wbd,num_workers):
"""
This helper function pulls and unzips Watershed Boundary Dataset (WBD) data. It uses the WBD URL defined by WBD_NATIONAL_URL.
@@ -125,6 +126,7 @@ def pull_and_prepare_wbd(path_to_saved_data_parent_dir,nwm_dir_name,nwm_file_to_

return(wbd_directory)


def pull_and_prepare_nwm_hydrofabric(path_to_saved_data_parent_dir, path_to_preinputs_dir,num_workers):
"""
This helper function pulls and unzips NWM hydrofabric data. It uses the NWM hydrofabric URL defined by NWM_HYDROFABRIC_URL.
6 changes: 4 additions & 2 deletions src/add_crosswalk.py
@@ -11,10 +11,11 @@
# sys.path.append('/foss_fim/src')
# sys.path.append('/foss_fim/config')
from bathy_rc_adjust import bathy_rc_lookup
from utils.shared_functions import getDriver
from utils.shared_functions import getDriver, mem_profile
from utils.shared_variables import FIM_ID

@profile

@mem_profile
def add_crosswalk(input_catchments_fileName,input_flows_fileName,input_srcbase_fileName,input_bathy_fileName,output_bathy_fileName,output_bathy_streamorder_fileName,output_bathy_thalweg_fileName,output_bathy_xs_lookup_fileName,output_catchments_fileName,output_flows_fileName,output_src_fileName,output_src_json_fileName,output_crosswalk_fileName,output_hydro_table_fileName,input_huc_fileName,input_nwmflows_fileName,input_nwmcatras_fileName,mannings_n,input_nwmcat_fileName,extent,small_segments_filename,calibration_mode=False):

input_catchments = gpd.read_file(input_catchments_fileName)
@@ -274,6 +275,7 @@ def add_crosswalk(input_catchments_fileName,input_flows_fileName,input_srcbase_f
with open(output_src_json_fileName,'w') as f:
json.dump(output_src_json,f,sort_keys=True,indent=2)


if __name__ == '__main__':

parser = argparse.ArgumentParser(description='Crosswalk for MS/FR networks; calculate synthetic rating curves; update short rating curves')
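
The change above swaps the hard-coded `@profile` decorator for `@mem_profile` imported from `utils.shared_functions`, which pairs with the `mem` variable that `fim_run.sh -m` now exports. The decorator's implementation is not part of this commit view; the following is one plausible sketch, and the environment-variable check and the memory_profiler backend are assumptions rather than confirmed details of the repository.

```python
import os
from functools import wraps


def mem_profile(func):
    """Apply memory profiling only when the run asks for it."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Assumed behaviour: fim_run.sh -m exports mem=1; otherwise the
        # decorated function runs unchanged, with no profiling overhead.
        if os.environ.get('mem') == '1':
            from memory_profiler import profile  # assumed profiling backend
            return profile(func)(*args, **kwargs)
        return func(*args, **kwargs)
    return wrapper
```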
3 changes: 3 additions & 0 deletions src/adjust_headwater_streams.py
@@ -159,6 +159,7 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id):

return nhd_streams, combined_pts


if __name__ == '__main__':

parser = argparse.ArgumentParser(description='adjust headwater stream geometery based on headwater start points')
@@ -173,6 +174,8 @@ def adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id):

args = vars(parser.parse_args())

#TODO variables below are not defined

adj_streams_gdf, adj_headwaters_gdf = adjust_headwaters(huc,nhd_streams,nwm_headwaters,nws_lids,headwater_id)

if subset_nhd_streams_fileName is not None: