slurm
Before installing SLURM, install MUNGE on the head node and on all compute nodes
yum -y install munge
ansible all -a "yum -y install munge"
Create a secret key on the head node. This also sets up the MUNGE directories with the correct permissions and ownership
create-munge-key
Repeat this step on all compute nodes so the directories and permissions are created there as well (the key itself will be replaced when the head node's key is copied over in the next step)
clush -ab "/usr/sbin/create-munge-key"
Now copy the key to all compute nodes
clush -acp /etc/munge/munge.key
Start and enable munge on the head node
systemctl start munge
systemctl enable munge
Then on the compute nodes
clush -ab "systemctl start munge"
clush -ab "systemctl enable munge"
Generate a credential on stdout:
munge -n
Check if a credential can be locally decoded:
munge -n | unmunge
Check if a credential can be remotely decoded:
munge -n | ssh somehost unmunge
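To check MUNGE across the whole cluster in one pass, the same remote decode test can be looped over the compute nodes (a quick sketch, assuming the compute00-compute07 hostnames used later in this guide):
for i in `seq 0 7`; do munge -n | ssh compute0$i unmunge; done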
Download the latest SLURM release from https://computing.llnl.gov/linux/slurm/download.html
For accounting we will also need a database later; more on this in the Accounting section.
yum -y install mariadb mariadb-devel
groupadd slurm
useradd -u 1000 -g slurm slurm
mkdir -p /var/log/slurm
chmod 755 /var/log/slurm
Add the same group and user on all compute nodes
ansible all -a "groupadd slurm"
ansible all -a "useradd -u 1000 -g slurm slurm"
ansible all -a "mkdir -p /var/log/slurm"
ansible all -a "chmod 755 /var/log/slurm"
Build a set of RPMs that can be installed on the login and compute nodes
rpmbuild -ta slurm-14.03.8.tar.bz2
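On RHEL/CentOS the built packages normally land under ~/rpmbuild/RPMS/x86_64 (path assumed; the rpmbuild output prints the exact location). List them to confirm the build succeeded:
ls ~/rpmbuild/RPMS/x86_64/slurm-*.rpm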
Install the RPMs
yum -y install slurm*.rpm
Place the RPMs in a shared location (such as /home) and install them on the compute nodes. To avoid stressing the file share, the RPMs can be installed on each compute node individually
[root@system]# for i in `seq 0 7`; do ssh compute0$i "yum -y install /home/rpms/slurm-* "; done;
Open the SLURM ports on the internal firewall
firewall-cmd --zone=internal --add-port=6817/tcp --permanent
success
firewall-cmd --zone=internal --add-port=6818/tcp --permanent
success
firewall-cmd --zone=internal --add-port=6819/tcp --permanent
success
Set a rule to accept all incoming connections from the compute nodes. Verify that this rich rule is present
firewall-cmd --zone=internal --list-all
internal (active)
  interfaces: enp2s0f1
  sources: 192.168.1.0/24
  services: dhcpv6-client dns ipp-client mdns nfs samba-client ssh
  ports: 443/tcp 6817/tcp 80/tcp 464/tcp 9443/tcp 88/udp 464/udp 6819/tcp 88/tcp 6818/tcp 7389/tcp 123/udp 9445/tcp 69/tcp 9444/tcp 53/tcp 69/udp 389/tcp 53/udp 636/tcp
  masquerade: no
  forward-ports:
  icmp-blocks:
  rich rules:
        rule family="ipv4" source address="192.168.1.0/24" accept
If it is not set, set it with this command.
firewall-cmd --permanent --zone=internal --add-rich-rule='rule family="ipv4" source address="192.168.1.0/24" accept'
Restart the firewall
firewall-cmd --reload
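Confirm that the SLURM ports are now listed in the zone:
firewall-cmd --zone=internal --list-ports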
Create the file /etc/slurm/slurm.conf
#
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=testsystemname
ControlMachine=testsystemname
ControlAddr=192.168.1.1
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/tmp/slurm
SlurmdSpoolDir=/tmp/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=2
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=1
#
# TIMERS
SlurmctldTimeout=120
SlurmdTimeout=60
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/linear
FastSchedule=1
PriorityType=priority/multifactor
PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
PriorityWeightFairshare=100000
PriorityWeightAge=1000
PriorityWeightPartition=10000
PriorityWeightJobSize=1000
PriorityMaxAge=14-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld
SlurmdDebug=1
SlurmdLogFile=/var/log/slurm/slurmd
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=DEFAULT Sockets=2 CoresPerSocket=20 ThreadsPerCore=2 State=UNKNOWN
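# NodeName=DEFAULT above only sets defaults; the actual nodes must still be listed.
# Example entry, assuming hostnames follow the compute0[0-7] pattern used elsewhere in this guide:
NodeName=compute0[0-7] State=UNKNOWN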
PartitionName=normal Nodes=ALL Default=YES MaxTime=480 State=UP
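Running slurmd -C on a compute node prints its detected hardware in slurm.conf syntax, which is a convenient way to confirm the Sockets, CoresPerSocket, and ThreadsPerCore values above:
clush -ab "slurmd -C"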
Next, set up accounting
Start MariaDB (installed earlier)
systemctl start mariadb
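It is also worth enabling MariaDB on boot, just as slurm and slurmdbd are enabled further down:
systemctl enable mariadb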
Grant permissions on the accounting database to the slurm user (replace 'password' with a real password)
mysql -u root -e "GRANT ALL ON slurm_acct_db.* TO 'slurm'@'localhost' identified by 'password' with grant option;"
Create the slurmdbd.conf file
cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
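At a minimum, slurmdbd.conf needs to point at the MariaDB account granted above. A minimal sketch of the relevant settings (localhost and 'password' are the same placeholders used in the GRANT statement; adjust to match your site):
AuthType=auth/munge
DbdHost=localhost
SlurmUser=slurm
StorageType=accounting_storage/mysql
StorageHost=localhost
StorageUser=slurm
StoragePass=password
StorageLoc=slurm_acct_db
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid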
Populate the accounting database
slurmdbd
Now add your cluster
sacctmgr add cluster testsystemname
 Adding Cluster(s)
  Name          = testsystemname
Would you like to commit changes? (You have 30 seconds to decide) (N/y): y
Add a billable account
sacctmgr add account testgroup Cluster=testsystemname Description="Default Account" Organization="Default Organization"
 Adding Account(s)
  testgroup
 Settings
  Description     = testgroup account
  Organization    = Default Organization
 Associations
  A = testgroup   C = testsystemname
 Settings
Would you like to commit changes? (You have 30 seconds to decide) (N/y): y
Add a user to this account
sacctmgr add user johnny.cab DefaultAccount=testgroup
 Adding User(s)
  johnny.cab
 Settings =
  Default Account = testgroup
 Associations =
  U = johnny.cab  A = testgroup   C = testsystemname
Would you like to commit changes? (You have 30 seconds to decide) (N/y): y
Start slurm on the head node
systemctl start slurm
Copy all slurm config files to computes
clush -apc /etc/slurm/*
Start slurm on computes
clush -a "/etc/init.d/slurm startclean"
Enable slurm on boot on the master
systemctl enable slurm
systemctl enable slurmdbd
Enable slurm on boot on the compute nodes
clush -ab "chkconfig slurm on"
Users can now submit jobs to the "normal" queue using standard submission scripts and the sbatch command.
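For reference, a minimal job script (the script name, resource requests, and output file are illustrative; the partition and account come from the configuration above):
#!/bin/bash
#SBATCH --job-name=hello
#SBATCH --partition=normal
#SBATCH --account=testgroup
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=00:05:00
#SBATCH --output=hello-%j.out

srun hostname
Submit it with sbatch hello.sh and check its progress with squeue.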