Implement base restart functionality #115

Draft: wants to merge 7 commits into support/aiida-2.X
23 changes: 8 additions & 15 deletions aiida_kkr/calculations/kkr.py
@@ -257,21 +257,14 @@ def define(cls, spec):
spec.default_output_node = 'output_parameters'

# define exit codes, also used in parser
spec.exit_code(
301,
'ERROR_NO_OUTPUT_FILE',
message='KKR output file not found',
)
spec.exit_code(
302,
'ERROR_KKR_PARSING_FAILED',
message='KKR parser retuned an error',
)
spec.exit_code(
303,
'ERROR_NO_SHAPEFUN_FOUND',
message='Could not find shapefun from voronoi parent',
)
spec.exit_code(301, 'ERROR_NO_OUTPUT_FILE', message='KKR output file not found')
spec.exit_code(302, 'ERROR_NOT_ENOUGH_MEMORY', message='KkrCalculation needs more memory')
spec.exit_code(303, 'ERROR_TIME_LIMIT', message='KkrCalculation needs more runtime')
spec.exit_code(304, 'ERROR_KKR_PARSING_FAILED', message='KKR parser returned an error')
spec.exit_code(305, 'ERROR_OPENING_OUTPUTS', message='KKR parser could not open an output file')
spec.exit_code(306, 'ERROR_CALCULATION_FAILED', message='KkrCalculation failed for an unknown reason')
spec.exit_code(307, 'ERROR_NO_SHAPEFUN_FOUND', message='Could not find shapefun from voronoi parent')
spec.exit_code(308, 'ERROR_RLOG_TOO_SMALL', message='RLOG too small for Chebychev solver')

def prepare_for_submission(self, tempfolder):
"""
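Note: these exit codes are intended as hooks for the error handlers of the base restart workchain that this PR implements. Below is a minimal sketch of how such handlers could consume them, built on aiida-core's BaseRestartWorkChain; the workchain class name and the recovery strategies are illustrative assumptions, not code contained in this diff.

from aiida.engine import BaseRestartWorkChain, ProcessHandlerReport, process_handler
from aiida_kkr.calculations.kkr import KkrCalculation


class KkrBaseWorkChain(BaseRestartWorkChain):
    """Hypothetical restart workchain around KkrCalculation (spec/outline omitted for brevity)."""

    _process_class = KkrCalculation

    @process_handler(priority=400, exit_codes=[KkrCalculation.exit_codes.ERROR_TIME_LIMIT])
    def handle_time_limit(self, node):
        """Restart with doubled walltime; assumes self.ctx.inputs was populated in setup()."""
        options = self.ctx.inputs.metadata['options']
        options['max_wallclock_seconds'] *= 2
        self.report(f'{node.process_label}<{node.pk}> hit the time limit, doubling the walltime')
        return ProcessHandlerReport()

    @process_handler(priority=410, exit_codes=[KkrCalculation.exit_codes.ERROR_NOT_ENOUGH_MEMORY])
    def handle_out_of_memory(self, node):
        """Restart on twice as many machines to provide more total memory."""
        self.ctx.inputs.metadata['options']['resources']['num_machines'] *= 2
        return ProcessHandlerReport()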
14 changes: 6 additions & 8 deletions aiida_kkr/calculations/kkrimp.py
@@ -168,21 +168,19 @@ def define(cls, spec):
Note: The lengths of the theta, phi and fix_dir lists have to be equal to the number of atoms in the impurity cluster.
"""
)
spec.input(
'cleanup_outfiles',
valid_type=Bool,
required=False,
default=lambda: Bool(False),
help='Cleanup and compress output (works only in aiida-core<2.0 and breaks caching ability).'
)

# define outputs
spec.output('output_parameters', valid_type=Dict, required=True, help='results of the KKRimp calculation')
spec.default_output_node = 'output_parameters'
# define exit codes, also used in parser
spec.exit_code(301, 'ERROR_NO_RETRIEVED_FOLDER', message='Retrieved folder of KKRimp calculation not found.')
spec.exit_code(302, 'ERROR_PARSING_KKRIMPCALC', message='KKRimp parser returned an error.')
#TBD
spec.exit_code(303, 'ERROR_NO_OUTPUT_FILE', message='KKRimp output file not found')
spec.exit_code(304, 'ERROR_NOT_ENOUGH_MEMORY', message='KkrimpCalculation needs more memory')
spec.exit_code(305, 'ERROR_TIME_LIMIT', message='KkrimpCalculation needs more runtime')
spec.exit_code(306, 'ERROR_OPENING_OUTPUTS', message='KKRimp parser could not open an output file')
spec.exit_code(307, 'ERROR_CALCULATION_FAILED', message='KkrimpCalculation failed for an unknown reason')
spec.exit_code(308, 'ERROR_RLOG_TOO_SMALL', message='RLOG too small for Chebychev solver')

def prepare_for_submission(self, tempfolder):
"""
7 changes: 6 additions & 1 deletion aiida_kkr/calculations/voro.py
@@ -81,7 +81,12 @@ def define(cls, spec):
spec.default_output_node = 'output_parameters'
# define exit codes, also used in parser
spec.exit_code(301, 'ERROR_NO_OUTPUT_FILE', message='Voronoi output file not found')
spec.exit_code(302, 'ERROR_VORONOI_PARSING_FAILED', message='Voronoi parser retuned an error')
spec.exit_code(302, 'ERROR_NOT_ENOUGH_MEMORY', message='VoronoiCalculation needs more memory')
spec.exit_code(303, 'ERROR_TIME_LIMIT', message='VoronoiCalculation needs more runtime')
spec.exit_code(304, 'ERROR_VORONOI_PARSING_FAILED', message='Voronoi parser returned an error')
spec.exit_code(305, 'ERROR_OPENING_OUTPUTS', message='Voronoi parser could not open an output file')
spec.exit_code(306, 'ERROR_CALCULATION_FAILED', message='VoronoiCalculation failed for an unknown reason')
spec.exit_code(307, 'ERROR_NACLSD_TOO_SMALL', message='NACLSD in VoronoiCalculation too small.')

def prepare_for_submission(self, tempfolder):
"""Create the input files from the input nodes passed to this instance of the `CalcJob`.
77 changes: 59 additions & 18 deletions aiida_kkr/parsers/kkr.py
@@ -222,23 +222,64 @@ def parse(self, debug=False, **kwargs):
msg = 'Automatically returned success=True for KKR importer although some parsing errors occurred'
self.logger.warning(msg)

# return an exit code if parsing fails
if not success:
# check error file
exit_code = self.check_error_file(out_folder)
if exit_code is not None:
return exit_code
# if nothing was returned we have a general parsing failure
return self.exit_codes.ERROR_KKR_PARSING_FAILED
else: # cleanup after parsing (only if parsing was successful)
# cleanup only works below aiida-core v2.0
if int(aiida_core_version.split('.')[0]) < 2:
# delete completely parsed output files
self.remove_unnecessary_files()
# then (maybe) tar the output to save space
# TODO needs implementing (see kkrimp parser)

def remove_unnecessary_files(self):
"""
Remove files that are not needed anymore after parsing
The information is completely parsed (i.e. in outdict of calculation)
and keeping the file would just be a duplication.
"""
files_to_delete = [KkrCalculation._POTENTIAL, KkrCalculation._SHAPEFUN]
for fileid in files_to_delete:
if fileid in self.retrieved.list_object_names():
self.retrieved.delete_object(fileid, force=True)

def check_error_file(self, out_folder):
"""Check if anything is in the error file and get some hints for error handler in restart workchain"""

# check if something was written to the error file
errorfile = self.node.attributes['scheduler_stderr']

if errorfile in out_folder.list_object_names():
# read
try:
with out_folder.open(errorfile, 'r') as efile:
error_file_lines = efile.read() # Note: read(), not readlines()
except OSError:
self.logger.error(f'Failed to open error file: {errorfile}.')
return self.exit_codes.ERROR_OPENING_OUTPUTS

# check lines in the errorfile
if error_file_lines:

if isinstance(error_file_lines, bytes):
error_file_lines = error_file_lines.replace(b'\x00', b' ')
else:
error_file_lines = error_file_lines.replace('\x00', ' ')

print(f'The following was written into std error and piped to {errorfile} : \n {error_file_lines}')
self.logger.warning(
f'The following was written into std error and piped to {errorfile} : \n {error_file_lines}'
)

# check for some errors which we can fix automatically
if 'STOP Error creating newmesh!' in error_file_lines:
return self.exit_codes.ERROR_RLOG_TOO_SMALL

# here we estimate how much walltime was available and consumed
try:
time_avail_sec = self.node.attributes['last_job_info']['requested_wallclock_time_seconds']
time_calculated = self.node.attributes['last_job_info']['wallclock_time_seconds']
if 0.97 * time_avail_sec < time_calculated:
return self.exit_codes.ERROR_TIME_LIMIT
except KeyError:
if 'TIME LIMIT' in error_file_lines.upper():
return self.exit_codes.ERROR_TIME_LIMIT

# check for out of memory errors
OUT_OF_MEMORY_PHRASES = [
'cgroup out-of-memory handler',
'Out Of Memory',
]
if any(phrase in error_file_lines for phrase in OUT_OF_MEMORY_PHRASES):
return self.exit_codes.ERROR_NOT_ENOUGH_MEMORY

# Catch all exit code for an unknown failure
return self.exit_codes.ERROR_CALCULATION_FAILED
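
For reference, the 97% walltime heuristic above works out as follows (the numbers are illustrative):

time_avail_sec = 3600    # requested_wallclock_time_seconds reported by the scheduler
time_calculated = 3540   # wallclock_time_seconds the job actually consumed
# 0.97 * 3600 = 3492 < 3540: the job used more than 97% of its allocation,
# so the failure is classified as ERROR_TIME_LIMIT.
assert 0.97 * time_avail_sec < time_calculated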
142 changes: 59 additions & 83 deletions aiida_kkr/parsers/kkrimp.py
@@ -132,22 +132,13 @@ def parse(self, debug=False, **kwargs):
# create output node and link
self.out('output_parameters', Dict(dict=out_dict))

# cleanup after parsing (only if parsing was successful), only works below aiida-core v2.0
if success:
if int(aiida_core_version.split('.')[0]) < 2:
# check if we should do the cleanup or not
cleanup_outfiles = False
if 'cleanup_outfiles' in self.node.inputs:
cleanup_outfiles = self.node.inputs.cleanup_outfiles.value
if cleanup_outfiles:
# reduce size of timing file
self.cleanup_outfiles(files['out_timing'], ['Iteration number', 'time until scf starts'])
# reduce size of out_log file
self.cleanup_outfiles(files['out_log'], ['Iteration Number'])
# delete completely parsed output files and create a tar ball to reduce size
self.remove_unnecessary_files()
self.final_cleanup()
else:
# return an exit code if parsing fails
if not success:
# check error file
exit_code = self.check_error_file(out_folder)
if exit_code is not None:
return exit_code
# if nothing was returned here we mark the calculation with the general parsing failure
return self.exit_codes.ERROR_PARSING_KKRIMPCALC

def _check_file_existance(self, files, keyname, fname, icrit, file_errors):
@@ -169,70 +160,55 @@ def _check_file_existance(self, files, keyname, fname, icrit, file_errors):
file_errors.append((icrit, crit_level + f" File '{fname}' not found."))
files[keyname] = None

def cleanup_outfiles(self, fileidentifier, keyslist):
"""open file and remove unneeded output"""
if fileidentifier is not None:
lineids = []
with self.retrieved.open(fileidentifier) as tfile:
txt = tfile.readlines()
for iline in range(len(txt)):
for key in keyslist: # go through all keys
if key in txt[iline]: # add line id to list if key has been found
lineids.append(iline)
# rewrite file deleting the middle part
if len(lineids) > 1: # cut only if more than one iteration was found
txt = txt[:lineids[0]] + \
['# ... [removed output except for last iteration] ...\n'] + \
txt[lineids[-1]:]
with self.retrieved.open(fileidentifier, 'w') as tfilenew:
tfilenew.writelines(txt)

def remove_unnecessary_files(self):
"""
Remove files that are not needed anymore after parsing
The information is completely parsed (i.e. in outdict of calculation)
and keeping the file would just be a duplication.
"""
# first delete unused files (completely in parsed output)
files_to_delete = [
KkrimpCalculation._OUT_ENERGYSP_PER_ATOM, KkrimpCalculation._OUT_ENERGYTOT_PER_ATOM,
KkrimpCalculation._SHAPEFUN
]
for fileid in files_to_delete:
if fileid in self.retrieved.list_object_names():
self.retrieved.delete_object(fileid, force=True)

def final_cleanup(self):
"""Create a tarball of the rest."""

# short name for retrieved folder
ret = self.retrieved

# Now create tarball of output
#
# check if output has been packed to tarfile already
# only if tarfile is not there we create the output tar file
if KkrimpCalculation._FILENAME_TAR not in ret.list_object_names():
# first create dummy file which is used to extract the full path that is given to tarfile.open
with ret.open(KkrimpCalculation._FILENAME_TAR, 'w') as f:
filepath_tar = f.name

# now create tarfile and loop over content of retrieved directory
to_delete = []
with tarfile.open(filepath_tar, 'w:gz') as tf:
for f in ret.list_object_names():
with ret.open(f) as ftest:
filesize = os.stat(ftest.name).st_size
ffull = ftest.name
if (
f != KkrimpCalculation._FILENAME_TAR # ignore tar file
and filesize > 0 # ignore empty files
# ignore files starting with '.' like '.nfs...'
and f[0] != '.'
):
tf.add(ffull, arcname=os.path.basename(ffull))
to_delete.append(f)

# finally delete files that have been added to tarfile
for f in to_delete:
ret.delete_object(f, force=True)
def check_error_file(self, out_folder):
"""Check if anything is in the error file and get some hints for error handler in restart workchain"""

# check if something was written to the error file
errorfile = self.node.attributes['scheduler_stderr']

if errorfile in out_folder.list_object_names():
# read
try:
with out_folder.open(errorfile, 'r') as efile:
error_file_lines = efile.read() # Note: read(), not readlines()
except OSError:
self.logger.error(f'Failed to open error file: {errorfile}.')
return self.exit_codes.ERROR_OPENING_OUTPUTS

# check lines in the errorfile
if error_file_lines:

if isinstance(error_file_lines, bytes):
error_file_lines = error_file_lines.replace(b'\x00', b' ')
else:
error_file_lines = error_file_lines.replace('\x00', ' ')

print(f'The following was written into std error and piped to {errorfile} : \n {error_file_lines}')
self.logger.warning(
f'The following was written into std error and piped to {errorfile} : \n {error_file_lines}'
)

# check for some errors which we can fix automatically
if 'STOP Error creating newmesh!' in error_file_lines:
return self.exit_codes.ERROR_RLOG_TOO_SMALL

# here we estimate how much walltime was available and consumed
try:
time_avail_sec = self.node.attributes['last_job_info']['requested_wallclock_time_seconds']
time_calculated = self.node.attributes['last_job_info']['wallclock_time_seconds']
if 0.97 * time_avail_sec < time_calculated:
return self.exit_codes.ERROR_TIME_LIMIT
except KeyError:
if 'TIME LIMIT' in error_file_lines.upper():
return self.exit_codes.ERROR_TIME_LIMIT

# check for out of memory errors
OUT_OF_MEMORY_PHRASES = [
'cgroup out-of-memory handler',
'Out Of Memory',
]
if any(phrase in error_file_lines for phrase in OUT_OF_MEMORY_PHRASES):
return self.exit_codes.ERROR_NOT_ENOUGH_MEMORY

# Catch all exit code for an unknown failure
return self.exit_codes.ERROR_CALCULATION_FAILED
59 changes: 59 additions & 0 deletions aiida_kkr/parsers/voro.py
@@ -125,5 +125,64 @@ def parse(self, debug=False, **kwargs):
# create output node and link
self.out('output_parameters', Dict(dict=out_dict))

# return an exit code if parsing fails
if not success:
# check error file
exit_code = self.check_error_file(out_folder)
if exit_code is not None:
return exit_code
# if nothing was returned so far we have an unidentified failure of the parser
return self.exit_codes.ERROR_VORONOI_PARSING_FAILED

def check_error_file(self, out_folder):
"""Check if anything is in the error file and get some hints for error handler in restart workchain"""

# check if something was written to the error file
errorfile = self.node.attributes['scheduler_stderr']

if errorfile in out_folder.list_object_names():
# read
try:
with out_folder.open(errorfile, 'r') as efile:
error_file_lines = efile.read() # Note: read(), not readlines()
except OSError:
self.logger.error(f'Failed to open error file: {errorfile}.')
return self.exit_codes.ERROR_OPENING_OUTPUTS

# check lines in the errorfile
if error_file_lines:

if isinstance(error_file_lines, bytes):
error_file_lines = error_file_lines.replace(b'\x00', b' ')
else:
error_file_lines = error_file_lines.replace('\x00', ' ')

print(f'The following was written into std error and piped to {errorfile} : \n {error_file_lines}')
self.logger.warning(
f'The following was written into std error and piped to {errorfile} : \n {error_file_lines}'
)

# check if NACLSD is too small
if 'STOP clsgen: Dimension error (a).' in error_file_lines:
return self.exit_codes.ERROR_NACLSD_TOO_SMALL

# here we estimate how much walltime was available and consumed
try:
time_avail_sec = self.node.attributes['last_job_info']['requested_wallclock_time_seconds']
time_calculated = self.node.attributes['last_job_info']['wallclock_time_seconds']
if 0.97 * time_avail_sec < time_calculated:
return self.exit_codes.ERROR_TIME_LIMIT
except KeyError:
if 'TIME LIMIT' in error_file_lines.upper():
return self.exit_codes.ERROR_TIME_LIMIT

# check for out of memory errors
OUT_OF_MEMORY_PHRASES = [
'cgroup out-of-memory handler',
'Out Of Memory',
]
if any(phrase in error_file_lines for phrase in OUT_OF_MEMORY_PHRASES):
return self.exit_codes.ERROR_NOT_ENOUGH_MEMORY

# Catch all exit code for an unknown failure
return self.exit_codes.ERROR_CALCULATION_FAILED
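
With these exit codes in place, a failed calculation can be triaged directly from the verdi shell before any workchain logic gets involved; a small illustrative example (the PK is made up):

from aiida.orm import load_node

calc = load_node(1234)  # PK of a failed VoronoiCalculation (illustrative)
if not calc.is_finished_ok:
    # exit_status and exit_message correspond to the spec.exit_code definitions above,
    # e.g. 307 / 'NACLSD in VoronoiCalculation too small.'
    print(calc.exit_status, calc.exit_message)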