Merge pull request #180 from adjtomo/devel
SeisFlows version 2.3.0
bch0w authored Jan 18, 2024
2 parents 3720e0b + 90e4121 commit 6cf7e62
Showing 22 changed files with 892 additions and 91 deletions.
5 changes: 5 additions & 0 deletions .readthedocs.yaml
@@ -1,5 +1,10 @@
version: 2

build:
os: "ubuntu-22.04"
tools:
python: "mambaforge-22.9"

sphinx:
builder: html
configuration: docs/conf.py
33 changes: 33 additions & 0 deletions CHANGELOG.md
@@ -24,3 +24,36 @@
- Removes GitHub links in dependencies in favor of PyPi packages
- Fixes broken Pyatoa import statements
- Pyadjoint misfit functions are now listed properly in the Pyaflowa docstring


## v2.3.0
A collection of bugfixes and feature improvements (big thanks to @evcano for major PR #168)

- Improved code organization for readability
- Allow acoustic domain and pressure seismograms in SPECFEM2D (#164)
- Improved workflow feature `export_residuals` (#168)
- Allow workflow toggling of SPECFEM parameter `SAVE_FORWARD_ARRAYS`
- System support for HPC Wisteria (U. Tokyo) (#176)
- Makes the location of the `submit` and `run` scripts a System class
  variable that can be overwritten
- Adds `system.Fujitsu` as a generalized System class for HPCs using the
Fujitsu workload manager
- Adds `system.Wisteria` for System interactions with HPC Wisteria
- Custom run and submit scripts for Wisteria are hardcoded to RSCGRP RC58
  and not yet generalized

### Bugfixes
- Broken symlinks for Cluster runscript locations when SeisFlows is
  installed via pip (#162); added a MANIFEST.in file to fix this
- Max workers in concurrent jobs was set with the wrong variable (#160); replaced with the correct variable
- Bracketing line search allowed non-passing models to pass the line search under certain criteria, fixed
- Solver directory initialization was too general when searching for
  source files (#169), leading to an incorrect number of source files being detected; the process now only looks for source files that start with the source prefix (e.g., CMTSOLUTION, FORCESOLUTION)
- Residuals files were sometimes written incorrectly (#168), which led to broken file reading. Residuals files are now written out at a more granular level to avoid multiple processes trying to write to one file
- Residuals files were appended to by all processes during line search, leading to incorrect misfit values (#168); fixed by the above bugfix
- Updated trial model was not exposed to line search after step count > 1 (#168), fixed
- Incorrect step lengths were defined when resuming a workflow (#168), fixed
- Exported synthetic traces were overwritten each iteration (#168). They are now tagged by iteration + step count to avoid overwriting (see the sketch after this list)
- output_optim.txt was not writing the correct misfit values for each
  iteration, fixed
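
For illustration, the iteration/step tagging fix might look like the following minimal sketch; the function and argument names here are hypothetical, not the exact SeisFlows identifiers:

```python
# Hypothetical sketch of tagging exported traces by iteration and step
# count so repeated line-search evaluations do not overwrite one another.
import os

def tagged_export_path(output_dir, source_name, iteration, step_count):
    """Build a unique export path such as output/001/traces_i01_s02"""
    tag = f"i{iteration:0>2}_s{step_count:0>2}"
    return os.path.join(output_dir, source_name, f"traces_{tag}")
```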

4 changes: 4 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,4 @@
# MANIFEST.in

include seisflows/seisflows/system/runscripts/run
include seisflows/seisflows/system/runscripts/submit
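
As a quick sanity check (a sketch, assuming a standard pip install), the runscripts bundled by the MANIFEST.in entries above can be located with the standard library's importlib.resources; the path below is relative to the `seisflows` package:

```python
# Sketch: confirm the runscripts listed in MANIFEST.in ship with the
# installed package. Paths are package-relative, per the entries above.
from importlib.resources import files

for name in ("run", "submit"):
    script = files("seisflows") / "system" / "runscripts" / name
    print(script, script.is_file())
```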
3 changes: 2 additions & 1 deletion environment.yml
@@ -1,12 +1,13 @@
name: seisflows
channels:
- conda-forge
- defaults
dependencies:
- pip
- obspy
- cartopy
- pyyaml
- pip:
- pyatoa<=0.2.2
- pysep-adjtomo<=0.4.1
- -e .

3 changes: 2 additions & 1 deletion pyproject.toml
Expand Up @@ -18,7 +18,8 @@ dependencies = [
"pyyaml",
"IPython",
"dill",
"pyatoa>=0.2.2",
"pyatoa<=0.2.2",
"pysep-adjtomo<=0.4.1",
]

[project.optional-dependencies]
70 changes: 48 additions & 22 deletions seisflows/optimize/gradient.py
Expand Up @@ -247,7 +247,9 @@ def checkpoint(self):
step_lens=self._line_search.step_lens,
gtg=self._line_search.gtg,
gtp=self._line_search.gtp,
step_count=self._line_search.step_count)
step_count=self._line_search.step_count,
step_len_max=self._line_search.step_len_max
)

np.savez(file=self.path._checkpoint, **dict_out) # NOQA

@@ -272,6 +274,7 @@ def load_checkpoint(self):
self._line_search.gtg = list(dict_in["gtg"])
self._line_search.gtp = list(dict_in["gtp"])
self._line_search.step_count = int(dict_in["step_count"])
self._line_search.step_len_max = float(dict_in["step_len_max"])
else:
logger.info("no optimization checkpoint found, assuming first run")
self.checkpoint()
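
Since the checkpoint is a standard NumPy `.npz` archive written by `np.savez`, it can be inspected directly. A minimal sketch, assuming a local copy of the checkpoint file (the real path is held in `self.path._checkpoint`):

```python
# Sketch: inspect the optimization checkpoint written by checkpoint().
# "checkpoint.npz" is a placeholder for the actual checkpoint path.
import numpy as np

with np.load("checkpoint.npz") as ckpt:
    print(ckpt.files)  # e.g., [..., 'step_count', 'step_len_max']
    step_len_max = float(ckpt["step_len_max"])  # newly checkpointed value
```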
@@ -364,13 +367,24 @@ def initialize_search(self):

return m_try, alpha

def increment_step_count(self):
"""
Convenience function to increment the line search step count by 1. This is
wrapped in a function to keep things explicit, rather than calling += 1
at arbitrary points in the code. We are also accessing a private member of
the class, so it is better to have a public function handle the increment.
"""
self._line_search.step_count += 1
logger.info(f"step count incremented -> {self._line_search.step_count}")

def update_line_search(self):
"""
Updates line search status and step length after a forward simulation
has been run and misfit calculated. Checks misfit against line search
history to see if the line search has been completed.
.. note::
This is a bit confusing as it calculates the step length `alpha` for
the NEXT line search step, while storing the `alpha` value that
was calculated from the LAST line search step. This is because we
@@ -382,6 +396,7 @@ def update_line_search(self):
and creating a new trial model (m_try).
.. note::
Available status returns are:
'TRY': try/re-try the line search as conditions have not been met
'PASS': line search was successful, you can terminate the search
@@ -396,7 +411,6 @@ def update_line_search(self):
f_try = self.load_vector("f_try") # misfit for the trial model

# Update the line search with a new step length and misfit value
self._line_search.step_count += 1
self._line_search.update_search_history(step_len=alpha_try,
func_val=f_try)

@@ -518,18 +532,14 @@ def _write_stats(self):
if they should be retained.
.. note::
This CSV file can be easily read and plotted using np.genfromtxt
>>> np.genfromtxt("optim_stats.txt", delimiter=",", names=True, \
dtype=None)
"""
logger.info(f"writing optimization stats")
# First time, write header information
if not os.path.exists(self.path._stats_file):
_head = ("step_count,step_length,gradient_norm_L1,gradient_norm_L2,"
"misfit,if_restarted,slope,theta\n")
with open(self.path._stats_file, "w") as f:
f.write(_head)

# Gather required information from line search parameters
g = self.load_vector("g_new")
p = self.load_vector("p_new")
x, f, *_ = self._line_search.get_search_history()
Expand All @@ -540,23 +550,39 @@ def _write_stats(self):
# factor = -1 * dot(g.vector, g.vector)
# factor = factor ** -0.5 * (f[1] - f[0]) / (x[1] - x[0])

# First time, write header information and start model misfit. Note that
# most of the statistics do not apply to the starting model so they
# are set to 0 by default
if not os.path.exists(self.path._stats_file):
_head = ("step_count,step_length,grad_norm_L1,grad_norm_L2,"
"misfit,if_restarted,slope,theta\n")
step_count = 0
step_length = x[0]
grad_norm_L1 = 0
grad_norm_L2 = 0
misfit = f[0]
slope = 0
theta = 0
_str = (f"{step_count:0>2},{step_length:6.3E},{grad_norm_L1:6.3E},"
f"{grad_norm_L2:6.3E},{misfit:6.3E},{int(self._restarted)},"
f"{slope:6.3E},{theta:6.3E}\n")
with open(self.path._stats_file, "w") as f_:
f_.write(_head)
f_.write(_str)

# Gather/calculate information from a given line search run
step_count = self._line_search.step_count
step_length = x[f.argmin()]
misfit = f[f.argmin()]

grad_norm_L1 = np.linalg.norm(g.vector, 1)
grad_norm_L2 = np.linalg.norm(g.vector, 2)

misfit = f[0]
slope = (f[1] - f[0]) / (x[1] - x[0])
step_count = self._line_search.step_count
step_length = x[f.argmin()]
theta = 180. * np.pi ** -1 * angle(p.vector, -1 * g.vector)

with open(self.path._stats_file, "a") as f:
f.write(# f"{factor:6.3E},"
f"{step_count:0>2},"
f"{step_length:6.3E},"
f"{grad_norm_L1:6.3E},"
f"{grad_norm_L2:6.3E},"
f"{misfit:6.3E},"
f"{int(self._restarted)},"
f"{slope:6.3E},"
f"{theta:6.3E}\n"
)
_str = (f"{step_count:0>2},{step_length:6.3E},{grad_norm_L1:6.3E},"
f"{grad_norm_L2:6.3E},{misfit:6.3E},{int(self._restarted)},"
f"{slope:6.3E},{theta:6.3E}\n")
with open(self.path._stats_file, "a") as f_:
f_.write(_str)
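
Following the `np.genfromtxt` hint in the docstring above, a short sketch for reading and plotting the stats file; the filename is assumed to match the default, and matplotlib is used only for illustration:

```python
# Sketch: read the comma-separated stats file and plot misfit per
# line-search evaluation. Column names come from the header written above.
import numpy as np
import matplotlib.pyplot as plt

stats = np.genfromtxt("optim_stats.txt", delimiter=",", names=True,
                      dtype=None)
plt.plot(stats["misfit"], marker="o")
plt.xlabel("line search evaluation")
plt.ylabel("misfit")
plt.show()
```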
13 changes: 5 additions & 8 deletions seisflows/plugins/line_search/bracket.py
@@ -201,22 +201,19 @@ def calculate_step_length(self):
alpha = None
status = "FAIL"

# Apply optional step length safeguard
if alpha is not None:
# Apply optional step length safeguard to step length
if status == "TRY":
if alpha > self.step_len_max and step_count == 0:
alpha = 0.618034 * self.step_len_max
logger.info(f"try: applying initial step length "
f"safegaurd as alpha has exceeded maximum step "
f"length, alpha_new={alpha:.2E}")
status = "TRY"
# Stop because safeguard prevents us from going further
# TODO Why is this passing? should status not be carried over?
elif alpha > self.step_len_max:
alpha = self.step_len_max
logger.info(f"try: applying initial step length "
f"safegaurd as alpha has exceeded maximum step "
f"length, alpha_new={alpha:.2E}")
status = "PASS"
logger.info(f"try: applying step length safegaurd as alpha has "
f"exceeded maximum allowable step length, "
f"alpha_new={alpha:.2E}")

return alpha, status

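Condensed, the safeguard branch above (applied only when the search status is "TRY") reduces to the following standalone sketch, with the golden-ratio factor 0.618034 retained:

```python
# Sketch of the step length safeguard: scale back an oversized first trial
# step by the golden ratio, and clip later trial steps to the maximum.
def apply_safeguard(alpha, step_len_max, step_count):
    if alpha > step_len_max:
        if step_count == 0:
            alpha = 0.618034 * step_len_max
        else:
            alpha = step_len_max
    return alpha
```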
19 changes: 13 additions & 6 deletions seisflows/preprocess/default.py
Expand Up @@ -34,7 +34,7 @@ class Default:
:type unit_output: str
:param unit_output: Data units. Must match the synthetic output of
external solver. Available: ['DISP': displacement, 'VEL': velocity,
'ACC': acceleration]
'ACC': acceleration, 'PRE': pressure]
:type misfit: str
:param misfit: misfit function for waveform comparisons. For available
see seisflows.plugins.preprocess.misfit
@@ -145,7 +145,7 @@ def __init__(self, syn_data_format="ascii", obs_data_format="ascii",
self._syn_acceptable_data_formats = ["SU", "ASCII"]
self._obs_acceptable_data_formats = ["SU", "ASCII", "SAC"]

self._acceptable_unit_output = ["DISP", "VEL", "ACC"]
self._acceptable_unit_output = ["DISP", "VEL", "ACC", "PRE"]

# Misfits and adjoint sources are defined by the available functions
# in each of these plugin files. Drop hidden variables from dir()
@@ -408,7 +408,8 @@ def _rename_as_adjoint_source(self, fid):

def _setup_quantify_misfit(self, source_name):
"""
Gather waveforms from the Solver scratch directory and
Gather waveforms from the Solver scratch directory which will be used
for generating adjoint sources
"""
source_name = source_name or self._source_names[get_task_id()]

@@ -424,7 +425,7 @@ def _setup_quantify_misfit(self, source_name):
# verify observed traces format
obs_ext = list(set([os.path.splitext(x)[-1] for x in observed]))

if self.obs_data_format == "ASCII":
if self.obs_data_format.upper() == "ASCII":
obs_ext_ok = obs_ext[0].upper() == ".ASCII" or \
obs_ext[0].upper() == f".SEM{self.unit_output[0]}"
else:
@@ -477,8 +478,8 @@ def _setup_quantify_misfit(self, source_name):
return observed, synthetic

def quantify_misfit(self, source_name=None, save_residuals=None,
save_adjsrcs=None, iteration=1, step_count=0,
**kwargs):
export_residuals=None, save_adjsrcs=None, iteration=1,
step_count=0, **kwargs):
"""
Prepares solver for gradient evaluation by writing residuals and
adjoint traces. Meant to be called by solver.eval_func().
@@ -551,6 +552,12 @@ def quantify_misfit(self, source_name=None, save_residuals=None,
if save_adjsrcs and self._generate_adjsrc:
self._check_adjoint_traces(source_name, save_adjsrcs, synthetic)

# Exporting residuals to disk (output/) for more permanent storage
if export_residuals:
if not os.path.exists(export_residuals):
unix.mkdir(export_residuals)
unix.cp(src=save_residuals, dst=export_residuals)

def finalize(self):
"""Teardown procedures for the default preprocessing class"""
pass
21 changes: 14 additions & 7 deletions seisflows/preprocess/pyaflowa.py
Expand Up @@ -110,7 +110,7 @@ def __init__(self, min_period=1., max_period=10., filter_corners=4,
workdir=os.getcwd(), path_preprocess=None,
path_solver=None, path_specfem_data=None, path_data=None,
path_output=None, syn_data_format="ascii",
data_case="data", components="ZNE",
data_case="data", components=None,
start=None, ntask=1, nproc=1, source_prefix=None,
**kwargs):
"""
@@ -133,8 +133,11 @@ def __init__(self, min_period=1., max_period=10., filter_corners=4,
a target model provided in `path_model_true`. If None, workflow will
not attempt to generate data.
:type components: str
:param components: components to consider and tag data with. Should be
string of letters such as 'RTZ'
:param components: components to search for synthetic data with. None by
default, which uses a wildcard when searching for synthetics. If
provided, the user only wants a subset of the components generated by
SPECFEM; in that case, `components` should be a string of letters such
as 'ZN' (for up and north components)
:type workdir: str
:param workdir: working directory in which to look for data and store
results. Defaults to current working directory
@@ -188,7 +191,11 @@ def __init__(self, min_period=1., max_period=10., filter_corners=4,
# so `seisflows configure` doesn't attribute these to preprocess.
self._syn_data_format = syn_data_format.upper()
self._data_case = data_case.lower()
self._components = components
if components is not None:
self._components = list(components) # e.g. 'RTZ' -> ['R', 'T', 'Z']
else:
self._components = components

self._start = start
self._ntask = ntask
self._nproc = nproc
@@ -263,7 +270,7 @@ def setup(self):
rotate=self.rotate, pyflex_preset=self.pyflex_preset,
fix_windows=self.fix_windows, adj_src_type=self.adj_src_type,
log_level=self.pyatoa_log_level, unit_output=self.unit_output,
component_list=list(self._components), save_to_ds=False,
component_list=self._components, save_to_ds=False,
st_obs_type=st_obs_type,
paths={"waveforms": self.path["_waveforms"] or [],
"responses": self.path["_responses"] or [],
@@ -294,8 +301,8 @@ def ftag(config):
return f"{config.event_id}_{config.iter_tag}_{config.step_tag}"

def quantify_misfit(self, source_name=None, save_residuals=None,
save_adjsrcs=None, iteration=1, step_count=0,
parallel=False, **kwargs):
export_residuals=None, save_adjsrcs=None, iteration=1,
step_count=0, parallel=False, **kwargs):
"""
Main processing function to be called by Workflow module. Generates
total misfit and adjoint sources for a given event with name