ocl: fixed composing build flags and other improvements #769

Merged: 1 commit, Mar 18, 2024.
.ci/daint.cscs.ch/ocl.build.sh (2 changes: 1 addition & 1 deletion)
@@ -27,7 +27,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
fi
cd "${HOME}/libxsmm"
git fetch
git checkout 05705477183444a82c8d9be8d7c2627efd6d67fa
git checkout 6c55e168d2053fa44f60f6985c370303bd84f9c1
make -j
cd ..

src/acc/opencl/acc_opencl.c (2 changes: 1 addition & 1 deletion)
@@ -1290,7 +1290,7 @@ int c_dbcsr_acc_opencl_flags_atomics(const c_dbcsr_acc_opencl_device_t* devinfo,
}
assert(NULL != atomic_exp);
/* compose build parameters and flags */
result = LIBXSMM_SNPRINTF(flags, flags_maxlen, "-DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type,
result = LIBXSMM_SNPRINTF(flags, flags_maxlen, " -DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type,
atomic_ops, atomic_exp, barrier_expr);
}
}
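Note on the one-character change above: the new format string starts with a blank (" -DTAN=%i ..."), presumably so the composed atomics flags cannot fuse with whatever build options the caller has already written in front of them. A minimal sketch of the failure mode with hypothetical flag values (Python used only for illustration; the real fragment is produced by c_dbcsr_acc_opencl_flags_atomics as shown in the hunk):

base = "-cl-std=CL2.0"        # build options assumed to be composed earlier by the caller
fragment = "-DTAN=1"          # first token of the atomics flags from the hunk above

print(base + fragment)        # "-cl-std=CL2.0-DTAN=1"  -> one malformed option token
print(base + " " + fragment)  # "-cl-std=CL2.0 -DTAN=1" -> two separate, valid options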
src/acc/opencl/smm/tune_multiply.py (12 changes: 6 additions & 6 deletions)
@@ -15,7 +15,6 @@
from opentuner import Result
from signal import signal, SIGINT
import tempfile
import socket
import shutil
import copy
import json
@@ -176,16 +175,15 @@ def __init__(self, args):
): # setup database (DB)
if args.database is None: # adjust DB-location
envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK"))
directory = "{}-{}".format(dbdir, os.getenv("HOSTNAME"))
if envrank:
self.idevice = int(envrank) % self.ndevices
directory = "{}-{}.db".format(dbdir, self.idevice)
else:
directory = "{}.db".format(dbdir)
directory += ".{}".format(self.idevice)
if os.path.isdir(directory):
shutil.rmtree(directory)
os.mkdir(directory)
self.args.database = "sqlite:///" + os.path.join(
directory, "{}.db".format(socket.gethostname())
directory, "{}.db".format(os.getpid())
)
if not self.args.label: # label for DB-session
self.args.label = "{}-{}-{}-s{}".format(
@@ -436,7 +434,7 @@ def merge_jsons(self, filenames):
s = 0
if 0 < gflops:
g = int(filename.split("-")[-1].split("g")[0])
s = gflops / g # slowdown
s = gflops / g if 0 < g else 0 # slowdown
if mtime < os.path.getmtime(filename):
if 0 < s:
retsld[1] = retsld[1] + math.log(s)
@@ -842,6 +840,8 @@ def handle_sigint(self, signum, frame):
# OPENCL_LIBSMM_SMM_xx=tune|enabled|on must be given to permit tuning)
if os.getenv("OPENCL_LIBSMM_SMM_WS") not in default_enable_tune:
os.environ["OPENCL_LIBSMM_SMM_WS"] = "{}".format(args.ws)
if os.getenv("OPENCL_LIBSMM_SMM_AL") not in default_enable_tune:
os.environ["OPENCL_LIBSMM_SMM_AL"] = "{}".format(args.al)
# fix tunables according to level of tuning
if 1 <= args.tlevel or 0 > args.tlevel:
os.environ["OPENCL_LIBSMM_SMM_BM"] = "{}".format(args.bm)
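Reading the __init__ hunk above, the tuner's database location is now derived per host (a directory named "<dbdir>-<HOSTNAME>", suffixed with the device index when an MPI/PMI launcher rank is visible), with one SQLite file per process id rather than one file named after the host. A condensed sketch of the new path construction; dbdir and ndevices are hypothetical stand-ins for the tuner's actual attributes, and the removal/recreation of the directory done by the real code is omitted:

import os

dbdir = "opentuner.db"   # hypothetical base name
ndevices = 1             # hypothetical device count

directory = "{}-{}".format(dbdir, os.getenv("HOSTNAME"))
envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK"))
if envrank:
    idevice = int(envrank) % ndevices
    directory += ".{}".format(idevice)
database = "sqlite:///" + os.path.join(directory, "{}.db".format(os.getpid()))
print(database)  # e.g. sqlite:///opentuner.db-mynode/12345.db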
src/acc/opencl/smm/tune_multiply.sh (78 changes: 43 additions & 35 deletions)
@@ -83,39 +83,19 @@ then
break;;
esac
done
# how to print standard vs error messages
if [ ! "${HELP}" ] || [ "0" = "${HELP}" ]; then
ECHO=">&2 echo"
else
ECHO="echo"
fi
eval "${ECHO} \"Usage: $0 [options] [<triplet-spec>]\""
eval "${ECHO} \" Options must precede triplet specification\""
eval "${ECHO} \" -w|--wait N: initial delay before auto-tuning (default: ${WAIT_DEFAULT} s)\""
eval "${ECHO} \" -c|--continue: proceed with plan if tuning is interrupted\""
eval "${ECHO} \" -u|--update: retune all JSONs found in directory (see -p)\""
eval "${ECHO} \" -s|--batchsize N: Number of batched SMMs (a.k.a. stacksize)\""
eval "${ECHO} \" -a|--tuning-level N=0..3: all, most, some, least tunables\""
eval "${ECHO} \" -b|--backwards: tune in descending order of triplets\""
eval "${ECHO} \" -t|--maxtime N: number of seconds spent per kernel\""
eval "${ECHO} \" -p|--jsondir P: path to JSON-files (tuned params)\""
eval "${ECHO} \" -i|--part N (1-based): Nth session out of nparts\""
eval "${ECHO} \" -j|--nparts N: number of total sessions (see -i)\""
eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\""
eval "${ECHO} \" -m|--limit N: limit any shape extent to N\""
eval "${ECHO} \" -n|--triplets N: limit number of triplet\""
eval "${ECHO} \" -k|--specid N: predefined triplets\""
eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\""
eval "${ECHO} \" 0: 201 kernels\""
eval "${ECHO} \" 10: 1266 kernels\""
eval "${ECHO} \" <triplet-spec>, e.g., 134 kernels\""
eval "${ECHO} \" 23, 5 32 13 24 26, 4 9\""
eval "${ECHO}"
# default settings
# default/basic settings
if [ ! "${BATCHSIZE}" ]; then BATCHSIZE=0; fi
if [ ! "${JSONDIR}" ]; then JSONDIR=.; fi
if [ ! "${TLEVEL}" ]; then TLEVEL=-1; fi
if [ ! "${NPARTS}" ]; then NPARTS=1; fi
if [ ! "${PART}" ]; then PART=1; fi
if [ ! "${NPARTS}" ]; then NPARTS=${PMI_SIZE:-1}; fi
if [ ! "${PART}" ]; then PART=${PMI_RANK:-0}; PART=$((PART+1)); fi
if [ ! "${WAIT}" ] && [ "1" = "${NPARTS}" ]; then WAIT=0; fi
# sanity checks
if [ "0" != "$((NPARTS<PART))" ]; then
>&2 echo "ERROR: part-number ${PART} is larger than the requested ${NPARTS} parts!"
@@ -131,7 +111,6 @@ then
exit 1
elif [ ! "${HELP}" ] || [ "0" = "${HELP}" ]; then
if [ "${UPDATE}" ] && [ "0" != "${UPDATE}" ]; then
if [ ! "${TLEVEL}" ] || [ "0" != "$((0>TLEVEL))" ]; then TLEVEL=1; fi
MNKS=$(${SED} -n "s/.*tune_multiply-..*-\(..*x..*x.[^-]*\)-..*gflops\.json/\1/p" <<<"${JSONS}" \
| ${SORT} -u -n -tx -k1,1 -k2,2 -k3,3)
elif [ "${SPECID}" ]; then
@@ -142,6 +121,30 @@ then
else
exit 0
fi
if [ ! "${WAIT}" ]; then
eval "${ECHO} \"Usage: $0 [options] [<triplet-spec>]\""
eval "${ECHO} \" Options must precede triplet specification\""
eval "${ECHO} \" -w|--wait N: initial delay before auto-tuning (default: ${WAIT_DEFAULT} s)\""
eval "${ECHO} \" -c|--continue: proceed with plan if tuning is interrupted\""
eval "${ECHO} \" -u|--update: retune all JSONs found in directory (see -p)\""
eval "${ECHO} \" -s|--batchsize N: Number of batched SMMs (a.k.a. stacksize)\""
eval "${ECHO} \" -a|--tuning-level N=0..3: all, most, some, least tunables\""
eval "${ECHO} \" -b|--backwards: tune in descending order of triplets\""
eval "${ECHO} \" -t|--maxtime N: number of seconds spent per kernel\""
eval "${ECHO} \" -p|--jsondir P: path to JSON-files (tuned params)\""
eval "${ECHO} \" -i|--part N (1-based): Nth session out of nparts\""
eval "${ECHO} \" -j|--nparts N: number of total sessions (see -i)\""
eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\""
eval "${ECHO} \" -m|--limit N: limit any shape extent to N\""
eval "${ECHO} \" -n|--triplets N: limit number of triplet\""
eval "${ECHO} \" -k|--specid N: predefined triplets\""
eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\""
eval "${ECHO} \" 0: 201 kernels\""
eval "${ECHO} \" 10: 1266 kernels\""
eval "${ECHO} \" <triplet-spec>, e.g., 134 kernels\""
eval "${ECHO} \" 23, 5 32 13 24 26, 4 9\""
eval "${ECHO}"
fi
if [ "${MNKS}" ]; then
if [ "${BOUNDL}" ] || [ "${BOUNDU}" ]; then
if [ ! "${BOUNDL}" ]; then BOUNDL=0; elif [ ! "${BOUNDU}" ]; then BOUNDU=0; fi
@@ -187,10 +190,12 @@ then
PARTSIZE=$(((NTRIPLETS+NPARTS-1)/NPARTS))
PARTOFFS=$(((PART-1)*PARTSIZE))
PARTSIZE=$((PARTSIZE<=(NTRIPLETS-PARTOFFS)?PARTSIZE:(NTRIPLETS-PARTOFFS)))
if [ "0" != "$((NPARTS<=NTRIPLETS))" ]; then
echo "Session ${PART} of ${NPARTS} part(s)."
else
echo "Session ${PART} of ${NPARTS} part(s). The problem is over-decomposed!"
if [ ! "${WAIT}" ] || [ "0" != "${WAIT}" ]; then
if [ "0" != "$((NPARTS<=NTRIPLETS))" ]; then
echo "Session ${PART} of ${NPARTS} part(s)."
else
echo "Session ${PART} of ${NPARTS} part(s). The problem is over-decomposed!"
fi
fi
if [ ! "${MAXTIME}" ] && [[ (! "${CONTINUE}" || \
"${CONTINUE}" = "false" || \
@@ -200,10 +205,12 @@ then
MAXTIME=160
fi
if [ "${MAXTIME}" ] && [ "0" != "$((0<MAXTIME))" ]; then
HRS=$((MAXTIME*PARTSIZE/3600))
MNS=$(((MAXTIME*PARTSIZE-HRS*3600+59)/60))
echo "Tuning ${PARTSIZE} kernels in this session will take about" \
"${MAXTIME}s per kernel and ${HRS}h${MNS}m in total."
if [ ! "${WAIT}" ] || [ "0" != "${WAIT}" ]; then
HRS=$((MAXTIME*PARTSIZE/3600))
MNS=$(((MAXTIME*PARTSIZE-HRS*3600+59)/60))
echo "Tuning ${PARTSIZE} kernels in this session will take about" \
"${MAXTIME}s per kernel and ${HRS}h${MNS}m in total."
fi
MAXTIME="--stop-after=${MAXTIME}"
else
echo "Tuning ${PARTSIZE} kernels will take an unknown time (no limit given)."
@@ -227,8 +234,9 @@ then
MNKPART=$(${CUT} -d' ' -f $((PARTOFFS+1))-$((PARTOFFS+PARTSIZE)) <<<"${MNKS}")
for MNK in ${MNKPART}; do
if [ "0" != "$(((N)<PARTSIZE))" ]; then
if [ "1" != "${NPARTS}" ] && [ "${HOSTNAME}" ]; then STEP="@${HOSTNAME}"; fi
echo
echo "[$((N+1))/${PARTSIZE}]: auto-tuning ${MNK}-kernel..."
echo "[$((N+1))/${PARTSIZE}]${STEP}: auto-tuning ${MNK}-kernel..."
# avoid mixing database of previous results into new session
${RM} -rf ./opentuner.db
eval "${HERE}/tune_multiply.py ${MNK} ${DELETE} -p ${JSONDIR} -s ${BATCHSIZE} -a ${TLEVEL} ${MAXTIME}"
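With the new defaults above, NPARTS falls back to PMI_SIZE and PART to PMI_RANK+1, so the script can be launched once per MPI rank and each rank works on a disjoint slice of the triplet list through the existing PARTSIZE/PARTOFFS arithmetic. A worked example of that ceil-divide-and-clamp partitioning with hypothetical numbers (Python used only for illustration):

ntriplets, nparts = 10, 4                           # e.g. 10 kernels tuned by 4 ranks
for part in range(1, nparts + 1):                   # PART is 1-based (PMI_RANK+1)
    partsize = (ntriplets + nparts - 1) // nparts   # PARTSIZE=$(((NTRIPLETS+NPARTS-1)/NPARTS))
    partoffs = (part - 1) * partsize                # PARTOFFS=$(((PART-1)*PARTSIZE))
    partsize = min(partsize, ntriplets - partoffs)  # clamp the last slice
    print(part, partoffs, partsize)                 # -> 1 0 3, 2 3 3, 3 6 3, 4 9 1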