# benchmark.py

# benchmark.py
# Reads in environment variables from environment.env
# Initializes the environment for osu-benchmark
# This Python script needs to be run whenever nodes are added or removed

# DEPENDENCIES
import os, csv, dotenv, subprocess, sys, random
import pandas as pd

# ENVIRONMENT VARIABLES
# These are read from environment.env in the current working directory
# (python-dotenv format). load_dotenv must run before the os.getenv calls
# below so that the values from the file are visible to them.
# NOTE: os.getenv returns None for a variable that is not set; the string
# concatenation for inst_include and the int() conversions below then raise
# TypeError at import time.
cwd = os.getcwd()
dotenv.load_dotenv(cwd+'/environment.env')

sinfo = os.getenv('SINFO') # path to sinfo command
sbatch = os.getenv('SBATCH') # path to sbatch command
squeue = os.getenv('SQUEUE') # path to squeue command
data_dir = os.getenv('DATA_DIR') # path where output files are stored
inst_dir = os.getenv('INST_DIR') # path to osu-mpi module
osu_module_name = os.getenv('OSU_MODULE_NAME') # name of module (with version)
py_module_name = os.getenv('PY_MODULE_NAME') # name of python module used in setting up venv
# No separator is inserted between inst_dir and osu_module_name
# (presumably INST_DIR is expected to end with '/' -- TODO confirm).
inst_include = inst_dir + osu_module_name + '/osu-benchmarks'
sample = os.getenv('JOBS_PER_HR') # number of jobs per hour (kept as a string, not converted)
njobs = int(os.getenv('N_JOBS')) # number of repetitions for benchmark

# Inclusive range of node numbers that will be analyzed through the benchmark.
# TODO: In the future, it may be sensible to change this to a predefined list of architectures.
# As of now (May 2020), newer nodes (node1333+) do not explicitly show their architecture when sinfo is invoked.
# arch = os.getenv('ARCH').split(',') could replace these.
node_start = int(os.getenv('NODE_START'))
node_finish = int(os.getenv('NODE_FINISH'))

# GLOBAL VARIABLES
# Ideally, arch_list should be removed once the TODO above (architecture list from ARCH) is implemented.
# 'unspecified' is the bucket for nodes whose features name no known architecture.
arch_list = ['sandy', 'ivy', 'haswell', 'broadwell', 'skylake', 'unspecified']
# Benchmark kinds; each name is appended to '/run_osu_' to form the command to run.
job_list =['bibw', 'latency']

# CLASS DEFINITIONS

# Dictionary with autovivification
# Refer to https://en.wikipedia.org/wiki/Autovivification#Python for details
class Tree(dict):
	"""Nested dictionary that creates missing levels on first access."""

	def __missing__(self, key):
		# dict calls this hook when self[key] is looked up and key is absent:
		# store a fresh, empty subtree of the same type under key and hand it back.
		subtree = type(self)()
		self[key] = subtree
		return subtree

# Class containing current node states
class NodeStat():
	"""Snapshot of the nodes reported by sinfo, grouped by architecture.

	Only nodes whose trailing number lies in [node_start, node_finish] are kept.
	"""

	def __init__(self):
		"""Query sinfo and index the target nodes.

		Raises:
			subprocess.CalledProcessError: if sinfo exits with a non-zero status.
		"""
		# First collect the list of nodes from sinfo
		# -N: node-oriented output, -h: no header, -o: "name,feature,feature,..."
		# An argument list (no shell) keeps '%N,%f' away from shell parsing.
		nodels_r = subprocess.run(
			[sinfo, '-N', '-h', '-o', '%N,%f'],
			stdout = subprocess.PIPE, check = True)
		nodels = nodels_r.stdout.decode()

		# Order the information into usable format
		# __nodeInfo: nodeXXX -> ['32core', 'intel', 'haswell']
		# __nodeArch: haswell -> ['nodeXXX', 'nodeYYY', 'nodeZZZ']
		self.__nodeInfo, self.__nodeArch = self._parseNodes(
			nodels, node_start, node_finish, arch_list)

	@staticmethod
	def _nodeNumber(name):
		"""Return the trailing integer of a node name ('node1333' -> 1333), or None if there is none."""
		digits = name[len(name.rstrip('0123456789')):]
		return int(digits) if digits else None

	@staticmethod
	def _parseNodes(nodels, first, last, archs):
		"""Turn 'name,feature,...' lines into (nodeInfo, nodeArch) dictionaries.

		Args:
			nodels: text with one 'name,feature,feature,...' record per line.
			first, last: inclusive range of node numbers to keep.
			archs: known architecture names; the first feature found in it
				decides the node's architecture, otherwise 'unspecified' is used.

		Returns:
			(nodeInfo, nodeArch): node name -> list of features, and
			architecture -> list of node names.
		"""
		nodeInfo = {}
		nodeArch = {}
		for group in nodels.splitlines():
			values = [v.strip() for v in group.split(',')]
			key = values[0]
			number = NodeStat._nodeNumber(key)
			# Skips blank lines, header lines and nodes outside the target range
			if number is None or number < first or number > last:
				continue
			# The same node may be listed more than once (e.g. once per partition)
			if key in nodeInfo:
				continue
			nodeInfo[key] = values[1:]
			# For node1333+ no architecture feature is reported -> 'unspecified'
			arch = next((v for v in values[1:] if v in archs), 'unspecified')
			nodeArch.setdefault(arch, []).append(key)
		return nodeInfo, nodeArch

	def randNode(self, arch = 'any', ratio = 0.01):
		"""Return a random list of node pairs that share one architecture.

		Args:
			arch: architecture to sample from; 'any' picks at random among the
				architectures that have at least two nodes.
			ratio: fraction of nNodes * nNodes / 2 pairs to draw; a fractional
				count is rounded up.
				NOTE(review): the 0.01 default is a conservative placeholder so
				that randNode() can be called without arguments -- confirm the
				intended value.

		Returns:
			List of [nodeA, nodeB] lists with nodeA != nodeB. The same pair may
			appear more than once. Empty if fewer than two nodes are available.
		"""
		if arch == 'any':
			# Only architectures with at least two nodes can supply a pair
			candidates = [a for a, nodes in self.__nodeArch.items() if len(nodes) >= 2]
			if not candidates:
				return []
			arch = random.choice(candidates)

		nodesList = list(self.__nodeArch.get(arch, []))
		nNodes = len(nodesList)
		if nNodes < 2:
			return []

		# First determine number of pairs to return
		nRandNode = nNodes * nNodes / 2 * ratio
		rNodePairs = []

		# Now sample pairs of nodes to benchmark
		while nRandNode > 0:
			# random.sample draws two distinct nodes
			rNodePairs.append(random.sample(nodesList, 2))
			nRandNode -= 1

		return rNodePairs

class SLURMJob():
	"""Builds one SLURM batch script (a list of text lines) per pair of nodes."""

	def __init__(self, nodePairList = None):
		"""Create the job list, optionally building scripts right away.

		Args:
			nodePairList: optional iterable of node pairs; when given, a script
				is built for every pair. Python keeps only the last __init__
				defined in a class, so both construction forms live here.
		"""
		self.__scriptList = []
		if nodePairList is not None:
			self.buildList(nodePairList)

	@staticmethod
	def _timeLimit(minutes):
		"""Format a number of minutes as 'H:MM:00' so the minutes field never exceeds 59."""
		hours, mins = divmod(minutes, 60)
		return '{}:{:02d}:00'.format(hours, mins)

	def _buildScript(self, nodePair):
		"""Return the lines of the SLURM script that benchmarks one pair of nodes."""
		curScript = [
			'#!/bin/bash',
			'#Autogenerated SLURM script for osu-benchmark',
			# Time limit grows with the number of repetitions (5 minutes each)
			'#SBATCH --time=' + self._timeLimit(njobs * 5),
			'#SBATCH --nodes=2',
			# Pin the job to the two nodes being benchmarked
			'#SBATCH --nodelist=' + ','.join(str(node) for node in nodePair),
			'#SBATCH --tasks-per-node=1',
			'#SBATCH --exclusive',
			''.join(['#SBATCH -o ', data_dir, '/%j.out']),
			# Load modules
			''.join(['module load ', py_module_name, ' ', osu_module_name]),
			# Source venv
			''.join(['source ', inst_include, '/../osu_env/bin/activate']),
		]
		# For each job type, one command line per repetition
		for jobtype in job_list:
			command = ''.join([inst_include, '/run_osu_', jobtype])
			curScript.extend([command] * njobs)
		return curScript

	def buildList(self, nodePairList):
		"""Append one generated script for every node pair in nodePairList."""
		for nodePair in nodePairList:
			self.__scriptList.append(self._buildScript(nodePair))

	def getScript(self):
		"""Return the list of generated scripts (each a list of lines)."""
		return self.__scriptList

def main():
	"""Sample node pairs, write one SLURM script per pair and submit each with sbatch.

	Scripts are written to data_dir as SLURMinput_<index>. A failed submission
	is reported on stderr and does not stop the remaining submissions.
	"""
	# First build NodeStat object
	curNode = NodeStat()
	# Then get list of random pairs of Nodes
	nodePairList = curNode.randNode()

	# Build SLURM submission script
	curSLURMjob = SLURMJob(nodePairList)
	# Read them out
	scriptList = curSLURMjob.getScript()
	# Use the configured sbatch path, falling back to whatever is on PATH
	sbatchCmd = sbatch or 'sbatch'
	# Run each script in the list
	for i, script in enumerate(scriptList):
		# Write out and use subprocess to execute on SLURM
		scriptPath = data_dir + '/SLURMinput_' + str(i)
		with open(scriptPath, 'w', newline="") as file:
			file.write("\n".join(str(line) for line in script))
			# End the last command with a newline
			file.write("\n")
		# Argument list: without shell=True a single command string would be
		# looked up as one executable name and fail
		result = subprocess.run([sbatchCmd, scriptPath])
		if result.returncode != 0:
			print('sbatch failed for ' + scriptPath, file=sys.stderr)
	# TODO

# Execute if main
# The entry script's module name is '__main__' (with the underscores);
# comparing against 'main' never matches, so main() would never run.
if __name__ == '__main__':
	main() # change to something appropriate