forked from TACC/launcher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
launcher
executable file
·151 lines (134 loc) · 4.22 KB
/
launcher
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/bin/bash
#------------------------------------------------
# Simple serial job launcher for TACC systems
#
# 03/29/2019 Si Liu
# Merge all versions together
# Generate the new 3.5 version now
#
# 9/9/2006 - l.wilson
# 1/21/2011 - l.wilson - Modified to spawn only
# 1 ssh/host
# 7/11/2012 - l.wilson - Added scheduling options
# 11/30/2012 - l.wilson - Added Xeon Phi support
#
# Texas Advanced Computing Center
# The University of Texas at Austin
#------------------------------------------------
# Wait a random 1-30 seconds before doing anything else.
# NOTE(review): presumably this staggers tasks that are started at the same
# moment -- confirm the original intent.
# Uses $(( )) instead of the deprecated $[ ] arithmetic form; a plain number
# is interpreted as seconds by sleep, so no unit suffix is needed.
sleep "$(( RANDOM % 30 + 1 ))"
# Command used to talk to the dynamic task server; extra flags can be
# supplied through LAUNCHER_NETCAT_FLAGS.
NETCAT_CMD="nc ${LAUNCHER_NETCAT_FLAGS}"

#######################################
# Change into the working directory and export the job file name, both taken
# from the script's positional parameters.
# Globals:   LAUNCHER_ON_PHI (read), LAUNCHER_JOB_FILE (written, exported)
# Arguments: the script's own arguments. The directory and job file are
#            $5 and $6 when LAUNCHER_ON_PHI is "1", otherwise $2 and $3.
# Outputs:   an error message on stderr if the directory cannot be entered
# Returns:   0 on success, 1 if cd fails (LAUNCHER_JOB_FILE is then untouched)
#######################################
_launcher_enter_workdir() {
  local work_dir job_file
  if [[ "$LAUNCHER_ON_PHI" == "1" ]]; then
    work_dir=$5
    job_file=$6
  else
    work_dir=$2
    job_file=$3
  fi
  # Quoted so that directories containing spaces or glob characters work.
  if ! cd -- "$work_dir"; then
    echo "ERROR: Launcher cannot change to directory '$work_dir'." >&2
    return 1
  fi
  export LAUNCHER_JOB_FILE="$job_file"
}

# Only fall back to the positional parameters when the environment did not
# already name a job file. Running jobs from the wrong directory would be
# worse than stopping, so a failed cd is fatal.
if [[ -z "$LAUNCHER_JOB_FILE" ]]; then
  _launcher_enter_workdir "$@" || exit 1
fi
#######################################
# Make sure LAUNCHER_NJOBS holds the number of jobs in the job file.
# A value already present in the environment is kept. (The previous test read
# the misspelled name LAUNCHER_JNOBS, so the value was always recomputed.)
# Jobs are the non-empty lines of the file, so blank lines are not counted
# and a final line without a trailing newline is.
# Globals:   LAUNCHER_JOB_FILE (read), LAUNCHER_NJOBS (read; written and
#            exported when it was empty)
#######################################
_launcher_set_njobs() {
  if [[ -z "$LAUNCHER_NJOBS" ]]; then
    LAUNCHER_NJOBS=$(grep -c . -- "$LAUNCHER_JOB_FILE")
    export LAUNCHER_NJOBS
  fi
}

_launcher_set_njobs
# LAUNCHER_DYN_SRV holds the two values that are later passed as arguments to
# the netcat command. The Xeon Phi pair is used when LAUNCHER_ON_PHI is "1",
# the regular pair in every other case.
case "$LAUNCHER_ON_PHI" in
  1)
    LAUNCHER_DYN_SRV="${LAUNCHER_PHI_DYN_COUNT} ${LAUNCHER_PHI_DYN_COUNT_PORT}"
    ;;
  *)
    LAUNCHER_DYN_SRV="${LAUNCHER_DYN_COUNT} ${LAUNCHER_DYN_COUNT_PORT}"
    ;;
esac
#if [[ "$LAUNCHER_TSK_ID" == "0" ]];then
# if [[ "$LAUNCHER_ON_PHI" == "1" ]];then
# echo "--------------------------------------------------"
# echo "Launcher (on Intel Xeon Phi) -> $LAUNCHER_NPROCS processors allocated."
# echo "Launcher (on Intel Xeon Phi) -> $LAUNCHER_NJOBS total tasks to complete."
# echo "--------------------------------------------------"
# echo " "
# else
# echo "--------------------------------------------------"
# echo "Launcher -> $LAUNCHER_NPROCS processors allocated."
# echo "Launcher -> $LAUNCHER_NJOBS total tasks to complete."
# echo "--------------------------------------------------"
# echo " "
# fi
#else
# sleep 0.5
#fi
#---------------------------------
# Setup initial JID for each task
#---------------------------------
#######################################
# Choose the first job ID (1-based) for this task according to
# LAUNCHER_SCHED:
#   interleaved - task k starts at job k + 1
#   block       - jobs are split into runs of ceil(NJOBS / NPROCS) jobs and
#                 task k starts at the first job of run k; nextblock is set
#                 to the first job of the following run
#   otherwise   - the ID is whatever the netcat command prints
# Globals:   LAUNCHER_SCHED, LAUNCHER_TSK_ID, LAUNCHER_NJOBS,
#            LAUNCHER_NPROCS, NETCAT_CMD, LAUNCHER_DYN_SRV (read);
#            LAUNCHER_JID (written, exported); blocks, nextblock (written in
#            block mode)
#######################################
_launcher_init_jid() {
  case "$LAUNCHER_SCHED" in
    interleaved)
      export LAUNCHER_JID=$(( LAUNCHER_TSK_ID + 1 ))
      ;;
    block)
      # Integer ceiling of NJOBS / NPROCS.
      blocks=$(( (LAUNCHER_NJOBS + LAUNCHER_NPROCS - 1) / LAUNCHER_NPROCS ))
      export LAUNCHER_JID=$(( LAUNCHER_TSK_ID * blocks + 1 ))
      nextblock=$(( (LAUNCHER_TSK_ID + 1) * blocks + 1 ))
      ;;
    *)
      # Grab the JID to run from The Count. The expansions are left unquoted
      # on purpose: the command and its arguments are stored as plain strings
      # and rely on word splitting.
      export LAUNCHER_JID=$($NETCAT_CMD $LAUNCHER_DYN_SRV)
      ;;
  esac
}

_launcher_init_jid
# Decide whether this task has any work at all. If its first job ID is the
# literal "Done" or is larger than the number of jobs, the task was
# over-allocated and must terminate without entering the work loop.
# Bug fix for GitHub issue #16: https://github.com/TACC/launcher/issues/16
if [[ "$LAUNCHER_JID" == "Done" || $LAUNCHER_JID -gt $LAUNCHER_NJOBS ]]; then
  COMPLETE="overallocate"
else
  COMPLETE="false"
fi
#######################################
# Print the command stored on the Nth non-empty line of a job file.
# Prints nothing when the file has fewer than N non-empty lines, so an
# out-of-range job ID can never select (and re-run) the last job.
# Arguments: $1 - job ID (1-based); $2 - path of the job file
# Outputs:   the command line on stdout
#######################################
_launcher_job_command() {
  awk -v wanted="$1" 'length($0) > 0 && ++seen == wanted { print; exit }' < "$2"
}

#######################################
# Ask the dynamic task server for the next job ID: one request followed by
# up to three retries while the reply is empty.
# Globals:   NETCAT_CMD, LAUNCHER_DYN_SRV (read);
#            LAUNCHER_JID (written, exported)
# Outputs:   a WARNING line on stdout before each retry and an ERROR line
#            once the retries are used up
# Returns:   0 as soon as a non-empty reply arrives, 1 otherwise
#######################################
_launcher_request_jid() {
  local attempt
  for (( attempt = 0; attempt <= 3; attempt++ )); do
    # Unquoted on purpose: command and arguments are stored as plain strings.
    export LAUNCHER_JID=$($NETCAT_CMD $LAUNCHER_DYN_SRV)
    if [[ -n "$LAUNCHER_JID" ]]; then
      return 0
    fi
    if (( attempt < 3 )); then
      echo "WARNING: No response from dynamic task server. Retrying..."
    fi
  done
  echo "ERROR: No response from dynamic task server after 3 retries. Exiting."
  return 1
}

#######################################
# Work loop: run the current job, pick the next job ID according to
# LAUNCHER_SCHED, and repeat until COMPLETE is no longer "false".
# Globals:   COMPLETE, LAUNCHER_JID (read and written); LAUNCHER_SCHED,
#            LAUNCHER_NPROCS, LAUNCHER_NJOBS, LAUNCHER_JOB_FILE,
#            LAUNCHER_TSK_ID, LAUNCHER_HWLOC_CMD, nextblock (read);
#            CMD (written, exported); START_TIME, END_TIME, EXEC_TIME (written)
# Arguments: the script's own arguments, passed through so that job commands
#            referring to positional parameters see the same values
# Exits:     with status 1 on an invalid interleaved stride or when the
#            dynamic task server stops answering
#######################################
_launcher_run_jobs() {
  # A stride below 1 would never advance and would re-run one job forever.
  if [[ $COMPLETE == "false" && $LAUNCHER_SCHED == "interleaved" ]] \
    && ! (( LAUNCHER_NPROCS >= 1 )); then
    echo "ERROR: LAUNCHER_NPROCS must be a positive integer for interleaved scheduling. Exiting." >&2
    exit 1
  fi

  while [[ $COMPLETE == "false" ]]; do
    # The job ID can be empty when the task server has not answered yet; in
    # that case skip straight to requesting the next one.
    if [[ -n "$LAUNCHER_JID" ]]; then
      CMD=$(_launcher_job_command "$LAUNCHER_JID" "$LAUNCHER_JOB_FILE")
      export CMD
      if [[ -n "$CMD" ]]; then
        START_TIME=$(date +"%s")
        echo "Launcher: Task $LAUNCHER_TSK_ID running job $LAUNCHER_JID on $(hostname) ($CMD)"
        # The job line is evaluated as shell code by design, so the job file
        # must be trusted. It is passed to eval as ONE quoted string so that
        # whitespace inside quotes survives and globs are expanded only once,
        # by eval itself.
        eval "$LAUNCHER_HWLOC_CMD $CMD"
        END_TIME=$(date +"%s")
        EXEC_TIME=$(( END_TIME - START_TIME ))
        echo "Launcher: Job $LAUNCHER_JID completed in $EXEC_TIME seconds."
      else
        echo "WARNING: Launcher found no command for job $LAUNCHER_JID in $LAUNCHER_JOB_FILE; skipping." >&2
      fi
    fi

    case "$LAUNCHER_SCHED" in
      interleaved)
        if [[ -z "$LAUNCHER_JID" ]]; then
          COMPLETE="true"
        else
          export LAUNCHER_JID=$(( LAUNCHER_JID + LAUNCHER_NPROCS ))
          if (( LAUNCHER_JID > LAUNCHER_NJOBS )); then
            COMPLETE="true"
          fi
        fi
        ;;
      block)
        export LAUNCHER_JID=$(( LAUNCHER_JID + 1 ))
        # Stop at the end of this task's block, and also at the end of the
        # job list: the last block may extend past the final job.
        if (( LAUNCHER_JID >= nextblock || LAUNCHER_JID > LAUNCHER_NJOBS )); then
          COMPLETE="true"
        fi
        ;;
      *)
        _launcher_request_jid || exit 1
        if [[ "$LAUNCHER_JID" == "Done" ]]; then
          COMPLETE="true"
        fi
        ;;
    esac
  done
}

_launcher_run_jobs "$@"
# Report normal completion. Nothing is printed for any other value of
# COMPLETE, e.g. "overallocate" for a task that never had work.
case "$COMPLETE" in
  true)
    echo "Launcher: Task ${LAUNCHER_TSK_ID} done. Exiting."
    ;;
esac