The cast:
ern.hpc.rutgers.edu - slurm database, ldap server, ganglia web
khan.hpc.rutgers.edu - cephfs cluster
riker.hpc.rutgers.edu - perfsonar web
mace - 3-node physical cluster at Rutgers:
  mace - login node, scheduler, local nfs
  mace[1-2] - compute nodes
~/ern-poc/fedconfig - config files - copy and modify for your site
Note: you will need to acquire the ldap cert, munge key and ceph key from
Rutgers OARC - these are not included in fedconfig
OS: CentOS 7.4
OpenHPC recipe: 1.3.4
Scheduler: slurm 17.11
Filesystem: cephfs, exported via nfs to compute nodes
Authn, authz: ldap, to be replaced with shibboleth
Other: ganglia, perfsonar, singularity
Eval: kubernetes
# mace replaced with <scheduler> below
# mace.local replaced with <scheduler.local>
# mace1-2 replaced with <node1-2>
# follow bare metal install for <scheduler> and <node1-2>
---
# bare metal install
# install CentOS 7.4 minimal
# set up ssh keys from local management servers
# git clone ern repo
git clone https://github.com/rutgers-oarc/ern-poc.git
yum -y update
yum -y groupinstall "Hardware Monitoring Utilities" "Performance Tools" "Development Tools" "Network File System Client" "Console Internet Tools" "Networking Tools" "System Administration Tools" "System Management" "Compatibility Libraries" "Platform Development"
#
# copy templates and add local entries at the top
# ansible will append later
cp ~/ern-poc/fedconfig/hosts /etc
cp ~/ern-poc/fedconfig/hosts.allow /etc
cp ~/ern-poc/fedconfig/hosts.deny /etc
vi /etc/hosts
vi /etc/hosts.allow
vi /etc/hosts.deny
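#
# for reference, the local entries are plain hosts(5) lines; the addresses
# below are placeholders for illustration, not real ERN assignments:
# 192.168.1.1    <scheduler.local> <scheduler>
# 192.168.1.101  <node1>.local <node1>
# 192.168.1.102  <node2>.local <node2>
# hosts.allow entries follow hosts_access(5), e.g. "sshd: 192.168.1."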
# OpenHPC
yum -y install http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/x86_64/ohpc-release-1.3-1.el7.x86_64.rpm
yum -y update
yum -y install ohpc-base
#
sed -i s/SELINUX=enforcing/SELINUX=permissive/ /etc/selinux/config
setenforce 0
yum -y install nmap iptraf-ng iperf3 hdparm msr-tools mlocate trafshow yum-utils
mkdir /scratch;chmod 1777 /scratch;ls -ld /scratch
# done with bare metal install
# on <scheduler>
# firewall
firewall-cmd --list-all-zones
firewall-cmd --zone=public --remove-service=dhcpv6-client
firewall-cmd --info-zone=public
# add internal interface to trusted zone
# ZONE=trusted
vi /etc/sysconfig/network-scripts/ifcfg-enp6s4f0
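# a quick check (not in the original recipe): bounce the interface so
# firewalld picks up the new zone, then confirm the assignment.
# enp6s4f0 is this host's internal interface - substitute your own.
ifdown enp6s4f0 && ifup enp6s4f0
firewall-cmd --get-zone-of-interface=enp6s4f0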
# add firewall rules for campus management networks
firewall-cmd --zone=public --add-rich-rule='rule family="ipv4" source address="a.b.c.d/e" accept'
firewall-cmd --zone=public --add-port=6817/tcp
cp ~/ern-poc/fedconfig/perfsonar.xml /etc/firewalld/services/
firewall-cmd --reload
firewall-cmd --zone=public --add-service=perfsonar
firewall-cmd --runtime-to-permanent
# if <scheduler> is the gateway for the nodes
firewall-cmd --zone=public --add-masquerade
firewall-cmd --runtime-to-permanent
# this should leave you with ssh, perfsonar and 6817/tcp (slurmctld) open
# in zone public. these will be restricted with hosts.allow, pscheduler
# limits and munge keys.
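# optional sanity check: list the resulting public zone configuration
firewall-cmd --zone=public --list-all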
# set up ssh keys to manage compute nodes
ssh-keygen
ssh-copy-id <node1>
ssh-copy-id <node2>
pdsh -w <node>[1-2] "uptime"
# if <scheduler> is the gateway for the nodes
echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.conf
sysctl -p /etc/sysctl.conf
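# confirm forwarding is on - this should print net.ipv4.ip_forward = 1
sysctl net.ipv4.ip_forward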
# ldap
yum install sssd openldap-clients
authconfig --updateall --enableldap --enableldapauth --ldapserver=ldap://ern.hpc.rutgers.edu:389 --ldapbasedn=dc=ufp,dc=hpc,dc=rutgers,dc=edu --enableldaptls --enableldapstarttls
# acquire the ldap cert from Rutgers OARC
cp ern_hpc_rutgers_edu_interm.cer /etc/openldap/cacerts
cacertdir_rehash /etc/openldap/cacerts/
systemctl restart sssd
id babbott
# slurm, munge
# check that user slurm is already in ldap
id slurm
yum install ohpc-slurm-server ohpc-slurm-client
# acquire munge.key from Rutgers OARC
cp munge.key /etc/munge
chown munge:munge /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
systemctl restart munge
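# optional munge sanity check: encode and decode a credential locally
munge -n | unmunge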
# Modify /etc/slurm/slurm.conf with the below, replacing the placeholders
# with your site's values
#ClusterName=<somename>
#ControlMachine=<scheduler.local>
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=ern.hpc.rutgers.edu
#NodeName=<node>[1-2] Sockets=2 CoresPerSocket=4 ThreadsPerCore=1 State=UNKNOWN
#PartitionName=DEFAULT DefaultTime=1:00:00 MaxTime=3-0 State=UP
#PartitionName=ern Nodes=<node>[1-2] Default=YES
#FederationParameters=fed_display
#
# Note: the OpenHPC install adds a line for AccountingStorageType
# comment it out:
##AccountingStorageType=accounting_storage/filetxt
vi /etc/slurm/slurm.conf
systemctl enable slurmctld
systemctl restart slurmctld
# notify Rutgers OARC of the cluster name and scheduler ip address,
# wait for confirmation that the slurmdbd is modified
systemctl restart slurmctld
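# optional checks once the slurmdbd side is confirmed: the local cluster
# and the federation should be visible (federation support is in 17.11)
sinfo
sacctmgr show cluster
sacctmgr show federation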
# send ssh keys
# and usernames to set up in ldap
# and add your information to the ERN google doc
# cephfs
# first, unmount /home. On CentOS 7 this requires stopping NetworkManager,
# or comment it out in /etc/fstab and reboot
rpm -Uhv http://download.ceph.com/rpm-mimic/el7/noarch/ceph-release-1-1.el7.noarch.rpm
yum clean all
yum update
yum install ceph-common
# get khan.key from Rutgers OARC
cp khan.key /etc/ceph
chmod 400 /etc/ceph/khan.key
# add this line to /etc/fstab:
#khan.hpc.rutgers.edu:6789:/ /home ceph name=admin,secretfile=/etc/ceph/khan.key,_netdev,noatime 0 0
mount /home
ls /home
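# optional check that /home is now cephfs rather than a local filesystem
df -hT /home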
# add to nfs and export
#/home 192.168.0.0/255.255.0.0(rw,fsid=0)
vi /etc/exports
exportfs -r
exportfs
systemctl enable nfs-server
systemctl start nfs-server
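# optional check that the export is visible (showmount is in nfs-utils)
showmount -e localhost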
# set up user ssh keys - replace babbott with your username
su - babbott
ssh-keygen
cat .ssh/id_rsa.pub >> .ssh/authorized_keys
chmod 400 .ssh/authorized_keys
ssh <scheduler> "uptime"
exit
# perfsonar
yum -y install http://software.internet2.edu/rpms/el7/x86_64/main/RPMS/perfSONAR-repo-0.8-1.noarch.rpm
yum clean all
yum -y install perfsonar-testpoint
yum -y install perfsonar-toolkit-servicewatcher
yum -y install perfsonar-toolkit-sysctl
yum -y install perfsonar-toolkit-systemenv-testpoint
pscheduler validate-limits
# get mesh config from ern.hpc and the northeast mesh
psconfig remote add --configure-archives "https://mesh.hpc.rutgers.edu/psconfig/ernmesh.json"
psconfig remote add --configure-archives "https://mesh.hpc.rutgers.edu/psconfig/northeastmesh.json"
# reboot the server and the services should start automatically
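# optional end-to-end check of the local pscheduler services
pscheduler troubleshoot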
# ganglia
yum -y install ganglia-gmond-ohpc
cp ~/ern-poc/fedconfig/gmond.conf /etc/ganglia
# change cluster name, udp send host
vi /etc/ganglia/gmond.conf
systemctl enable gmond
systemctl start gmond
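# optional check (assumes the default tcp_accept_channel on port 8649 and
# the ncat binary pulled in with nmap earlier): gmond should answer with
# cluster XML
ncat localhost 8649 | head -20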
# firewall settings
# all from local subnet - replace with yours
firewall-cmd --zone=public --add-rich-rule='rule family="ipv4" source address="f.g.h.i/j" accept'
# ssh, slurm, perfsonar should already be in zone public
# ganglia from central
firewall-cmd --zone=public --add-rich-rule='rule family="ipv4" source address="128.6.226.160/27" port port=8649 protocol=tcp accept'
firewall-cmd --runtime-to-permanent
# singularity
yum -y install singularity-ohpc
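# optional check: singularity-ohpc is delivered under /opt/ohpc and loaded
# via lmod; the module name "singularity" is assumed here
module load singularity
singularity --version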
# compute nodes
# complete <node1-2> (bare metal install and <scheduler> ssh keys already done)
# commands run from <scheduler>
for i in {1..2};do scp /etc/hosts <node>$i:/etc;done
for i in {1..2};do scp /etc/hosts.allow <node>$i:/etc;done
for i in {1..2};do scp /etc/hosts.deny <node>$i:/etc;done
# only if <scheduler> is gateway, else configure firewall
pdsh -w <node>[1-2] 'systemctl disable firewalld'
# ldap
pdsh -w <node>[1-2] 'yum -y install sssd openldap-clients'
pdsh -w <node>[1-2] 'mkdir /etc/openldap/cacerts'
for i in {1..2};do echo <node>$i;scp /etc/openldap/cacerts/ern_hpc_rutgers_edu_interm.cer <node>$i:/etc/openldap/cacerts;ssh <node>$i 'cacertdir_rehash /etc/openldap/cacerts/';done
pdsh -w <node>[1-2] 'authconfig --updateall --enableldap --enableldapauth --ldapserver=ldap://ern.hpc.rutgers.edu:389 --ldapbasedn=dc=ufp,dc=hpc,dc=rutgers,dc=edu --enableldaptls --enableldapstarttls'
pdsh -w <node>[1-2] 'systemctl restart sssd'
pdsh -w <node>[1-2] 'id babbott'
# slurm
# the slurm uid from ldap should be 20003
pdsh -w <node>[1-2] 'id slurm'
pdsh -w <node>[1-2] 'yum -y install ohpc-slurm-client ohpc-slurm-server'
for i in {1..2};do scp /etc/munge/munge.key <node>$i:/etc/munge/munge.key;done
pdsh -w <node>[1-2] 'chown munge:munge /etc/munge/munge.key;systemctl restart munge'
for i in {1..2};do scp /etc/slurm/slurm.conf <node>$i:/etc/slurm;done
pdsh -w <node>[1-2] 'systemctl enable slurmd;systemctl restart slurmd'
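# optional check from <scheduler>: the nodes should now register with
# slurmctld (they may show down until /home is mounted below)
sinfo -N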
# nfs mounts - do on each compute node
ssh <node>1
# comment out local /home if present, add nfs home
#<scheduler.local>:/home /home nfs defaults 0 0
vi /etc/fstab
exit
# repeat for <node>2
# back to <scheduler>
pdsh -w <node>[1-2] 'umount /home;mount /home'
# ganglia
pdsh -w <node>[1-2] 'yum -y install ganglia-gmond-ohpc'
for i in {1..2};do echo <node>$i;scp /etc/ganglia/gmond.conf <node>$i:/etc/ganglia;done
pdsh -w <node>[1-2] 'systemctl enable gmond'
pdsh -w <node>[1-2] 'systemctl start gmond'
# confirm user can ssh from <scheduler> to <node1-2> without password
su - babbott
for i in {1..2};do ssh <node>$i "uptime";done
exit
# singularity
pdsh -w <node>[1-2] 'yum -y install singularity-ohpc'
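# optional final smoke test as an ldap user: run a two-node job through
# slurm (the ern partition is the default from slurm.conf above)
su - babbott
srun -N 2 hostname
exit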