From d553ead08783bd002fb243a885805a614951f3fb Mon Sep 17 00:00:00 2001 From: Michael Zingale Date: Sun, 17 Sep 2023 12:08:29 -0400 Subject: [PATCH] remove titan --- job_scripts/titan/process.titan | 242 -------------------------------- job_scripts/titan/titan.run | 85 ----------- 2 files changed, 327 deletions(-) delete mode 100755 job_scripts/titan/process.titan delete mode 100755 job_scripts/titan/titan.run diff --git a/job_scripts/titan/process.titan b/job_scripts/titan/process.titan deleted file mode 100755 index a68fb02..0000000 --- a/job_scripts/titan/process.titan +++ /dev/null @@ -1,242 +0,0 @@ -#!/bin/bash -l - -#---------------------------------------------------------------------------- -# user modifiable variables: - -# pidfile is a lock file that is used to make sure that only one instance -# of this script is working on the current directory -pidfile=process.pid - - -# set the prefix of the plotfiles and checkpoint files -plt_prefix=*plt -chk_prefix=*chk - -# directory to archive to on HPSS -- set this to the working directory -work_dir=`pwd` -HPSS_DIR=`basename $work_dir` - -# set HTAR command -HTAR=/sw/xk6/hsi/5.0.2.p1/sles11.5/bin/htar - -# path to the ftime executable -- used for making a simple ftime.out file -# listing the name of the plotfile and its simulation time. -# note that the gfortran-compiled version of ftime will fail to run if either -# the PrgEnv-gnu or gcc module isn't loaded (it needs some gcc libraries). -FTIME_EXE=ftime.gnu.interlagos.ex - - -#---------------------------------------------------------------------------- -# initialization stuff - -# check to make sure that the lock file does not already exist. -if [ -f $pidfile ]; then - echo 2>&1 "process lock file " $pidfile " already exists" - exit -1 -fi - -# create the lock file -echo $$ > $pidfile - -# if our process if killed, remove the lock file first -trap '/bin/rm -f $pidfile' EXIT HUP TERM XCPU KILL - -# Number of seconds to sleep before checking again. -N=60 - - - -#---------------------------------------------------------------------------- -# make storage directories - -# once we process a file, we will move the plotfiles into the plotfiles/ -# directory. This then hides them from the script, so if the system -# later purges the files in the pltXXXXX directory and the .processed -# file, we don't overwrite our archived data with a tarred empty -# directory structure. We do the same with the checkpoint files (using -# checkfiles/) - -if [ ! -d plotfiles ]; then - mkdir plotfiles -fi - -if [ ! -d checkfiles ]; then - mkdir checkfiles -fi - - -#---------------------------------------------------------------------------- -# the processing function - -# Process Files. Once a plotfile is successfully processed, we will output -# a file pltXXXXX.processed (checkpoint files are only archived, with a -# chkXXXXX.processed file appearing once the archiving is successful). -# Subsequent invocations of this routine will skip over any plotfiles or -# checkpoint files that have a corresponding .processed file. - - -function process_files -{ - if [ ! -f $pidfile ]; then - echo "process: $pidfile has been removed, exiting" - exit - fi - - - # plotfiles - - # Take all but the final plt file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as pltXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the plotfiles/ - # directory. The trailing sed command removes any "./" prefixes, which can break later - # htar commands. - pltlist5=$(find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort | sed "s/\.\///") - pltlist6=$(find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort | sed "s/\.\///") - pltlist7=$(find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort | sed "s/\.\///") - - pltlist="$pltlist5 $pltlist6 $pltlist7" - - if [ "$pltlist" ]; then - nl=$(echo "$pltlist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - pltlist="" - else - pltlist=$(echo "$pltlist" | head -$nl) - fi - fi - - - for dir in ${pltlist} - do - if [ -d ${dir} ]; then - # only work on the file if there is not a .processed file in the - # main directory or the plotfiles/ directory - if [ ! -f ${dir}.processed ] && [ ! -f plotfiles/${dir}.processed ]; then - - # do processing - - # store the file on HPSS - # Note that the -P flag tells HTAR to create intermediate directories - # if they don't exist. - ${HTAR} -P -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # output the plotfile name and simulation time to ftime.out - if [ -f "${FTIME_EXE}" ] ; then - ./${FTIME_EXE} ${dir} >> ftime.out - fi - - # remove the htar temporary file - rm ${dir}.htar - - # move the plotfile into the plotfiles directory - mv ${dir} plotfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed plotfiles/ - - fi - fi # end test of whether plotfile already processed - fi # end test of whether plotfile is a directory (as it should be) - done - - - # checkpoint files - - # Take all but the final chk file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as chkXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the checkfiles/ - # directory. The trailing sed command removes any "./" prefixes, which can break later - # htar commands. - chklist5=$(find . -maxdepth 1 -type d -name "${chk_prefix}?????" -print | sort | sed "s/\.\///") - chklist6=$(find . -maxdepth 1 -type d -name "${chk_prefix}??????" -print | sort | sed "s/\.\///") - chklist7=$(find . -maxdepth 1 -type d -name "${chk_prefix}???????" -print | sort | sed "s/\.\///") - - chklist="$chklist5 $chklist6 $chklist7" - - if [ "$chklist" ]; then - nl=$(echo "$chklist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - chklist="" - else - chklist=$(echo "$chklist" | head -$nl) - fi - fi - - - for dir in ${chklist} - do - if [ -d ${dir} ]; then - # only work on the file if there is not a .processed file in the - # main directory or the checkfiles/ directory - if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then - # store the file on HPSS - # Note that the -P flag tells HTAR to create intermediate directories - # if they don't exist. - ${HTAR} -P -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # remove the htar temporary file - rm ${dir}.htar - - # move the checkpoint file into the checkfiles directory - mv ${dir} checkfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed checkfiles/ - - fi - fi - fi - done -} - - -#---------------------------------------------------------------------------- -# the main function - -# archive any diagnostic files first -- give them a unique name, appending -# the date string, to make sure that we don't overwrite anything -datestr=$(date +"%Y%m%d_%H%M_%S") -diag_files=$(find . -maxdepth 1 -name "diag_*.out" -print) -ftime_files=$(find . -maxdepth 1 -name "ftime.out" -print) - -if [ "${diag_files}" ] || [ "${ftime_files}" ]; then - # Note that the -P flag tells HTAR to create intermediate directories - # if they don't exist. - ${HTAR} -P -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${diag_files} ${ftime_files} inputs* *.run >> /dev/null -fi - -# Loop, waiting for plt and chk directories to appear. -while true -do - process_files - sleep $N -done diff --git a/job_scripts/titan/titan.run b/job_scripts/titan/titan.run deleted file mode 100755 index 5b6d888..0000000 --- a/job_scripts/titan/titan.run +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -l -#PBS -A ast106 -#PBS -N [job name] -#PBS -j oe -#PBS -q batch -#PBS -l walltime=06:00:00,nodes=512 - -# Set the name of the executable and input file -executableFile="Maestro3d.gnu.interlagos.MPI.OMP.ex" -inputsFile="inputs_3d_1lev.5120" - -# this script runs with 4 threads per MPI task, 4 MPI tasks/node (2 per NUMA node), and 512 nodes on titan -# - -export PSC_OMP_AFFINITY=FALSE -export OMP_NUM_THREADS=4 - -cd $PBS_O_WORKDIR - -# run the compression script to tar up the plot and checkpoint files -# as they are created. -./process.titan & -PID=$! -trap 'kill -s TERM $PID' EXIT TERM HUP XCPU KILL - -# find the latest restart file -- first look for one with 7 digits then fall -# back to 6 -restartFile=$(find . -maxdepth 1 -type d -name "*chk???????" -print | sort | tail -1 | cut -c 3-) - -# the Header is the last thing written -- check if it's there, otherwise, -# fall back to the second-to-last check file written -if [ ! -f ${restartFile}/Header ]; then - # how many *chk??????? files are there? if only one, then skip - nl=$(find . -maxdepth 1 -type d -name "*chk???????" -print | sort | wc -l) - if [ $nl -gt 1 ]; then - restartFile=$(find . -maxdepth 1 -type d -name "*chk???????" -print | sort | tail -2 | head -1 | cut -c 3-) - else - restartFile="" - fi -fi - -# if the above checks failed, then there are no valid 7-digit chk files, so -# check the 6-digit ones -if [ "${restartFile}" = "" ]; then - restartFile=$(find . -maxdepth 1 -type d -name "*chk??????" -print | sort | tail -1 | cut -c 3-) - - # make sure the Header was written, otherwise, check the second-to-last - # file - if [ ! -f ${restartFile}/Header ]; then - # how many *chk?????? files are there? if only one, then skip - nl=$(find . -maxdepth 1 -type d -name "*chk??????" -print | sort | wc -l) - if [ $nl -gt 1 ]; then - restartFile=$(find . -maxdepth 1 -type d -name "*chk??????" -print | sort | tail -2 | head -1 | cut -c 3-) - else - restartFile="" - fi - fi -fi - -if [[ ${executableFile} =~ .*Maestro.* ]]; then - restartBaseString="maestro.restart_file" -else - restartBaseString="amr.restart" -fi - -# restartFile will be empty if no chk files are found -- i.e. new run -if [ "${restartFile}" = "" ]; then - restartString="" -else - restartString="${restartBaseString}=${restartFile}" - echo "Restarting with: " ${restartString} -fi - -# Titan has 18688 physical nodes, each of which has 16 cores and 2 NUMA nodes -# -# -n is the total number of MPI tasks (should be nodes*-S*2) -# -S is the number of MPI tasks per NUMA node -# -d is the number of OpenMP threads per MPI task (must match OMP_NUM_THREADS) -# -ss forces MPI tasks to only allocate memory in their local NUMA node. -# This can boost performance by preventing costly remote memory I/O, though -# it also restricts the amount of memory available to MPI tasks. - -aprun -n 2048 -S 2 -d 4 -ss ./${executableFile} ${inputsFile} ${restartString} - -rm -f process.pid