# 升级xCAT # BMC不用设置网关,要在networks表中去掉网关信息
cd /etc/yum.repos.d/
rm lenovo-hpc.repo -f
cd
tar xjvf xcat-2.13.8_POST140_confluent-1.7.2_dev29_lenovo-confluent-0.7.1_post35-el7.tar.bz2
cd lenovo-hpc-el7/
bash mklocalrepo.sh
yum clean all
yum update -y
yum install confluent* -y (如果原来没有安装confluent的话)
yum install lenovo* -y (如果原来没有安装confluent的话)
grep i01r05 production-hosts.txt (依实际柜编号查找)
chtab key=domain site.value=sng.lrz.de
ip a
chtab key=master site.value=mgt33 (按实际管理节点处理 )
chtab key=forwarders site.value=172.16.131.254 (这个依实际配置,不同的island有不同的网络 )
chtab key=nameservers site.value=172.16.131.254 (理由如上一要)
chtab key=dhcpinterfaces site.value=eno1 (依实际处理)
chtab key=consoleservice site.value=confluent
#在在行配置前,建议重置默认的表(默认模板/opt/xcat/share/xcat/templates/e1350/ )
#当节点发现完成后,不用重启机器,将IMM设置成shared模式
[root@mgt33 ~]# pscp asu64 all:/root/
[root@mgt33 ~]# pping bmc|xcoll
====================================
lrzbmc
====================================
noping
[root@mgt33 ~]# nodeshell all /root/asu64 set IMM.SharedNicMode "Shared" --kcs
[root@mgt33 ~]# pping bmc|xcoll
====================================
lrzbmc
====================================
ping
91 ./asu64 showvalues imm.sharednicmode --kcs
92 ./asu64 set imm.sharednicmode dedicated --kcs
[root@mgt33 ~]# grep fpc production-hosts.txt | grep i01r05
172.16.15.25 i01r05c01fpc.sng.lrz.de i01r05c01fpc
172.16.15.26 i01r05c02fpc.sng.lrz.de i01r05c02fpc
172.16.15.27 i01r05c03fpc.sng.lrz.de i01r05c03fpc
172.16.15.28 i01r05c04fpc.sng.lrz.de i01r05c04fpc
172.16.15.29 i01r05c05fpc.sng.lrz.de i01r05c05fpc
172.16.15.30 i01r05c06fpc.sng.lrz.de i01r05c06fpc
nodeadd i01r05c0[1-6]fpc groups=fpc
chdef -t group fpc mgt=ipmi bmcpassword=PASSW0RD bmcusername=USERID cons=ipmi nodetype=fpc
chdef -t group fpc bmc='/\z//' ip='|i01r05c(\d+).*$|172.16.15.($1+24)|'
[root@mgt33 ~]# tabdump switch
#node,switch,port,vlan,interface,comments,disable
"36perswitch","|\w+-(\d+).*$|switch(($1-1)/42+1)|","|\w+-(\d+).*$|(($1-1)%42+1)|",,,,
"36perswitch1","|\w+-(\d+).*$|switch(($1-1)/42+2)|","|\w+-(\d+).*$|(($1-1)%42+1)|",,,,
"fpc","|i01r05c(\d+).*$|i01r05s(($1-1)/3+21)eth|","|i01r05c(\d+).*$|((($1-1)%3)+37)|",,,,
"lrzswitch","|i01r05c(\d+)s.*$|i01r05s(($1-1)/3+21)eth|","|i01r05c(\d+)s(\d+).*$|((($1-1)%3)*12+$2)|",,,,
[root@mgt33 ~]# lsdef i01r05c01fpc
Object name: i01r05c01fpc
bmc=i01r05c01fpc
bmcpassword=PASSW0RD
bmcusername=USERID
cons=ipmi
groups=fpc
ip=172.16.15.25
mgt=ipmi
nodetype=fpc
postbootscripts=otherpkgs
postscripts=syslog,remoteshell,syncfiles
switch=i01r05s21eth
switchport=37
[root@mgt33 ~]# configfpc -i eno1
Found IP 192.168.0.100 and MAC ec:aa:a0:19:87:d0
Configured FPC with MAC ec:aa:a0:19:87:d0 as i01r05c05fpc (172.16.15.29)
Verified the FPC with MAC ec:aa:a0:19:87:d0 is responding to the new IP 172.16.15.29 as node i01r05c05fpc
Found IP 192.168.0.100 and MAC ec:aa:a0:19:87:8c
Configured FPC with MAC ec:aa:a0:19:87:8c as i01r05c04fpc (172.16.15.28)
Verified the FPC with MAC ec:aa:a0:19:87:8c is responding to the new IP 172.16.15.28 as node i01r05c04fpc
Found IP 192.168.0.100 and MAC ec:aa:a0:19:87:e5
Configured FPC with MAC ec:aa:a0:19:87:e5 as i01r05c06fpc (172.16.15.30)
Verified the FPC with MAC ec:aa:a0:19:87:e5 is responding to the new IP 172.16.15.30 as node i01r05c06fpc
Found IP 192.168.0.100 and MAC ec:aa:a0:19:87:87
Configured FPC with MAC ec:aa:a0:19:87:87 as i01r05c02fpc (172.16.15.26)
Verified the FPC with MAC ec:aa:a0:19:87:87 is responding to the new IP 172.16.15.26 as node i01r05c02fpc
Found IP 192.168.0.100 and MAC ec:aa:a0:19:87:d5
Configured FPC with MAC ec:aa:a0:19:87:d5 as i01r05c03fpc (172.16.15.27)
Verified the FPC with MAC ec:aa:a0:19:87:d5 is responding to the new IP 172.16.15.27 as node i01r05c03fpc
Found IP 192.168.0.100 and MAC ec:aa:a0:19:87:e8
Configured FPC with MAC ec:aa:a0:19:87:e8 as i01r05c01fpc (172.16.15.25)
Verified the FPC with MAC ec:aa:a0:19:87:e8 is responding to the new IP 172.16.15.25 as node i01r05c01fpc
There are no more default IP address to process
[root@mgt33 ~]# lsdef -t osimage -z rhels7.3-x86_64-netboot-compute |sed 's/^[^ ]\+:/opa:/' |mkdef -z
1 object definitions have been created or modified.
[root@mgt33 ~]# mkdir -p /install/netboot/rhels7.3/x86_64/opa
[root@mgt33 ~]# chdef -t osimage opa rootimgdir=/install/netboot/rhels7.3/x86_64/opa
1 object definitions have been created or modified.
[root@Rack_TestData ~]# scp /tmp/myopa74_20180312.tgz mgt33:/root/
myopa74_20180312.tgz 71% 674MB 113.1MB/s 00:02 ETA
preload opa osimage
nodeset all osimage=opa
rsetboot all net -u
rpower all reset
[root@mgt33 ~]# psh i01r05c0[1-2]s[01-12] opainfo | grep State |xcoll
====================================
i01r05c01s06,i01r05c02s11,i01r05c02s01,i01r05c02s06,i01r05c02s08,i01r05c01s07,i01r05c01s04,i01r05c01s08,i01r05c01s05,i01r05c01s02,i01r05c02s07,i01r05c02s09,i01r05c01s01,i01r05c01s10,i01r05c01s12,i01r05c02s02,i01r05c02s12,i01r05c01s11,i01r05c02s05,i01r05c01s03,i01r05c02s04,i01r05c02s10,i01r05c02s03,i01r05c01s09
====================================
PortState: Init (LinkUp)
[root@mgt33 ~]# ssh i01r05c01s01
Last login: Tue Mar 13 11:27:06 2018 from 172.16.131.254
[root@i01r05c01s01 ~]# service opafm start
Redirecting to /bin/systemctl start opafm.service
[root@i01r05c01s01 ~]# exit
[root@mgt33 ~]# psh i01r05c0[1-2]s[01-12] opainfo | grep State |xcoll
====================================
i01r05c01s06,i01r05c02s11,i01r05c02s01,i01r05c02s06,i01r05c02s08,i01r05c01s07,i01r05c01s04,i01r05c01s08,i01r05c01s05,i01r05c01s02,i01r05c02s07,i01r05c02s09,i01r05c01s01,i01r05c01s10,i01r05c01s12,i01r05c02s02,i01r05c02s12,i01r05c01s11,i01r05c02s05,i01r05c01s03,i01r05c02s04,i01r05c02s10,i01r05c02s03,i01r05c01s09
====================================
PortState: Active
[root@mgt33 ~]# cat update_opa_lrz.sh
nodeshell all hfi1_eprom -w -o /usr/share/opa/bios_images/HfiPcieGen3Loader_1.6.0.0.0.rom
nodeshell all hfi1_eprom -w -b /usr/share/opa/bios_images/HfiPcieGen3_1.6.0.0.0.efi
nodeshell all hfi1_eprom -w -c /lib/firmware/updates/hfi1_platform.dat
nodeshell all opatmmtool -f /lib/firmware/updates/hfi1_smbus.fw update
[root@node04 ~]# opatmmtool
Current Firmware Version=10.4.0.0.146
[root@node04 ~]# hfi1_eprom -V -o
Using device: /sys/bus/pci/devices/0000:06:00.0/resource0
loader file version: 1.6.0.0.0
[root@node04 ~]# hfi1_eprom -V -b
Using device: /sys/bus/pci/devices/0000:06:00.0/resource0
driver file version: 1.6.0.0.0
[root@node04 ~]# hfi1_eprom -V -c
Using device: /sys/bus/pci/devices/0000:06:00.0/resource0
config file version: HFI_TYPE1 v1.0.1.0
[root@node04 ~]#
[root@mgt33 ~]# md5sum step1.asu step2.asu
bec97fb914bd8822199939a045ff6d8d step1.asu
73bec0ae708caf530d42407d71ab3947 step2.asu
[root@mgt33 ~]# pasu i01r05c04s[11-12] set Processors.TDP 240 --override
i01r05c04s12: Processors.TDP=240
i01r05c04s12: Waiting for command completion status.
i01r05c04s11: Processors.TDP=240
i01r05c04s11: Waiting for command completion status.
i01r05c04s12: Command completed successfully.
i01r05c04s11: Command completed successfully.
[root@mgt33 ~]# md5sum python_tools/LRZ/sys-health-check /install/johnw/sys-health-check
c410fcacd669ca6b798135d79f40c2d8 python_tools/LRZ/sys-health-check
c410fcacd669ca6b798135d79f40c2d8 /install/johnw/sys-health-check
[root@mgt33 ~]# /install/johnw/sys-health-check all | tee /tmp/health-check-201803142357.log
====================================
compute
====================================
Expected Actual Status Description
----------- ------ ------ -----------
1.50 1.50 OK Node XCC Firmware Version
OTE106I OTE106I OK Node UEFI Firmware Version
VT-UTF8 VT-UTF8 OK DevicesandIOPorts.Com1TerminalEmulation
Enable Enable OK DevicesandIOPorts.Com1ActiveAfterBoot
Hardware Hardware OK DevicesandIOPorts.Com1FlowControl
Cooperative Cooperative OK Processors.CPUPstateControl
Disable Disable OK Processors.SNC
UEFI UEFI OK EnableDisableAdapterOptionROMSupport.OnboardVideo
None None OK DiskGPTRecovery.DiskGPTRecovery
Custom Mode Custom Mode OK OperatingModes.ChooseOperatingMode
240W 240W OK Processor Power
3.10.0-693 3.10.0-693 OK Linux Kernel Version
96@powersave 96@powersave OK CPU scaling_govenor
96@1200000 96@1200000 OK CPU scaling_min_freq
96@3900000 96@3900000 OK CPU scaling_max_freq
94GB 94GB OK MEMORY Total
[always] madvise never [always] madvise never OK MEMORY Huge Transparent Pages
1 1 OK OPA Ports visible to Operating System
(rev 11) (rev 11) OK OPA Card Level(s)
x16 x16 OK OPA PCI Lanes
10.6.1.0.2 10.6.1.0.2 OK OPA Driver Version
Active Active OK OPA Link Status
4 4 OK OPA Tx LaneWidth
4 4 OK OPA Rx LaneWidth
5 (Excellent) 5 (Excellent) OK OPA Link Quality
datagram datagram OK OPA mode
4092 4092 OK OPA MTU
128 128 OK OPA Send Queue Size
256 256 OK OPA Receive Queue Size
v1.0.1.0 v1.0.1.0 OK OPA Adapter Config Version
1.6.0.0.0 1.6.0.0.0 OK OPA Adapter Loader Version
1.6.0.0.0 1.6.0.0.0 OK OPA Adapter Config Version
10.4.0.0.146 10.4.0.0.146 OK OPA Adapter TMM Version
[root@mgt33 ~]#
102144 Ns
1 # of NBs
384 NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
1 Ps
1 Qs
[root@mgt33 ~]# md5sum peter/cluster/HPL.dat.1n
8aa31f171cd8b23509132c065e38da0d peter/cluster/HPL.dat.1n
[root@mgt33 ~]# pscp peter/cluster/HPL.dat.1n all:/root/peter/HPL.dat
[root@mgt33 ~]# cat peter/lrz_single.sh
#!/bin/bash
#
mount | grep mgt | grep install &>/dev/null
if [ $? -eq 0 ];then
echo -e "mgt is mount"
else
mount mgt33:/install /install
fi
timefmt=`date +%Y%m%d%H%M%s`
mkdir -p /install/mgt33
#mount mgt33:/install /install
/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/benchmarks/mp_linpack/xhpl_intel64_static | tee /install/mgt33/single-$(hostname -s)-${timefmt}.log
#
[root@mgt33 ~]#
[root@mgt33 ~]# pscp peter/lrz_single.sh all:/root/peter/
i01r05c01s01: done
[root@mgt33 ~]# nodeshell i01r05c[01-02]s[01-12] "cd /root/peter; bash lrz_single.sh"
for a in `seq 1 100`; do timefmt=`date +%Y%m%d%H%M%S`; mkdir -p /install/mgt33; nodesensors i01r05c[01-02]s[01-12] >/install/mgt33/sensors_${timefmt}.log; sleep 60; done
[root@mgt33 ~]# psh all opainfo | grep State | xcoll
====================================
compute
====================================
PortState: Active
生成hostfile
复制到指定节点的:/root/peter/cluster下面
将/root/peter/HPL.dat.xn(注意:HPL.dat.36node是给768G FAT NODE用的,96G的Thin node 要用HPL.dat.36n)复制到跑linpack节点的:/root/peter/cluster/HPL.dat
进入某节点/root/peter/cluster/下跑对应的脚本,如bash 36node.sh
[root@mgt33 ~]# psh i01r05c[01-06]s01 service opafm start
i01r05c02s01: Redirecting to /bin/systemctl start opafm.service
i01r05c03s01: Redirecting to /bin/systemctl start opafm.service
i01r05c01s01: Redirecting to /bin/systemctl start opafm.service
i01r05c05s01: Redirecting to /bin/systemctl start opafm.service
i01r05c06s01: Redirecting to /bin/systemctl start opafm.service
i01r05c04s01: Redirecting to /bin/systemctl start opafm.service
[root@mgt33 mgt33]# nodelist i01r05c0[1-3]s[01-12] >hostfile
[root@mgt33 mgt33]# pscp hostfile i01r05c01s01:/root/peter/cluster/
i01r05c01s01: done
[root@mgt33 mgt33]# nodelist i01r05c0[4-6]s[01-12] >hostfile
[root@mgt33 mgt33]# pscp hostfile i01r05c04s01:/root/peter/cluster/
[root@mgt33 ~]# pscp peter/cluster/HPL.dat.36n all:/root/peter/cluster/HPL.dat
[root@mgt33 ~]# ssh i01r05c01s01
[root@i01r05c01s01 ~]# cd peter/cluster/
[root@i01r05c01s01 cluster]# cat hostfile
[root@i01r05c01s01 cluster]#bash 36node.sh
power cycle测试 数据收集
[root@mgt33 ~]# ls HPL
HPL.dat parse.sh run.sh single_run.sh skl3_HPL_2018-03-12_17-23 xhpl
[root@mgt33 ~]# scp -r HPL python_tools/
[root@mgt33 ~]# cat HPL/single_run.sh
#!/bin/bash
hostname=`hostname -s`
fmt=`date +%Y%m%d%H%M%S`
./xhpl | tee /install/mgt33/single_hpl/single_${hostname}_${fmt}.log
[root@mgt33 ~]# pscp -r HPL all:/root/
[root@mgt33 ~]# psh all mount mgt33:/install /install
[root@mgt33 ~]# ls /install/mgt33/single_hpl/
[root@mgt33 ~]# psh i01r05c0[1-3]s[01-12] "cd /root/HPL; bash single_run.sh"
[root@mgt33 ~]# grep WR /install/mgt33/single_hpl/single_i01r05c0* |sort -k7 -n
/install/mgt33/single_hpl/single_i01r05c01s12_20180314202740.log:WR00L2L4 103296 384 1 1 260.48 2.82091e+03
/install/mgt33/single_hpl/single_i01r05c02s10_20180314202744.log:WR00L2L4 103296 384 1 1 253.41 2.89959e+03
/install/mgt33/single_hpl/single_i01r05c03s04_20180314202738.log:WR00L2L4 103296 384 1 1 248.33 2.95899e+03
/install/mgt33/single_hpl/single_i01r05c03s11_20180314202742.log:WR00L2L4 103296 384 1 1 248.15 2.96114e+03
/install/mgt33/single_hpl/single_i01r05c02s02_20180314202742.log:WR00L2L4 103296 384 1 1 246.98 2.97517e+03
/install/mgt33/single_hpl/single_i01r05c01s08_20180314202742.log:WR00L2L4 103296 384 1 1 246.49 2.98107e+03
/install/mgt33/single_hpl/single_i01r05c01s10_20180314202742.log:WR00L2L4 103296 384 1 1 245.63 2.99146e+03
/install/mgt33/single_hpl/single_i01r05c01s09_20180314202742.log:WR00L2L4 103296 384 1 1 242.60 3.02889e+03
/install/mgt33/single_hpl/single_i01r05c03s12_20180314202743.log:WR00L2L4 103296 384 1 1 242.56 3.02937e+03
/install/mgt33/single_hpl/single_i01r05c01s04_20180314202743.log:WR00L2L4 103296 384 1 1 240.56 3.05459e+03
/install/mgt33/single_hpl/single_i01r05c02s05_20180314202741.log:WR00L2L4 103296 384 1 1 239.26 3.07110e+03
/install/mgt33/single_hpl/single_i01r05c02s11_20180314202741.log:WR00L2L4 103296 384 1 1 234.09 3.13903e+03
/install/mgt33/single_hpl/single_i01r05c03s02_20180314202743.log:WR00L2L4 103296 384 1 1 233.00 3.15364e+03
1108 pscp xhpl all:/root/peter/cluster/
1097 pscp HPL.dat.36n all:/root/peter/cluster/
1098 nodelist i01r05c[01-03]s[01-12]
1099 nodelist i01r05c[01-03]s[01-12] >hostfile
1100 pscp hostfile i01r05c[01-03]s[01-12]:/root/peter/cluster/
1101 nodelist i01r05c[04-06]s[01-12] >hostfile
1102 pscp hostfile i01r05c[04-06]s[01-12]:/root/peter/cluster/
[root@mgt33 cluster]# pscp HPL.dat.36n all:/root/peter/cluster/HPL.dat
[root@i01r05c01s01 cluster]# cat n36.sh
#!/bin/bash
source ./tools_cluster.sh
# . /root/peter/cluster/tools_cluster.sh
fm=`date +%Y%m%d%H%M`
echo -e " make sure all OPA port is up and running "
#echo -e "ln -s /usr/lib/systemd/system/opafm.service /etc/systemd/system/multi-user.target.wants/opafm.service"
# systemctl restart opafm.service
# opainfo
# opafabricinfo
#
mkdir -p /install/tmp/
echo -e "Test log is store in the /install/tmp/linpack_${fm}.log"
echo -e "\n\n***********************************************************************************************\n"
mpirun -genvall -f hostfile -np 36 -ppn 1 /root/peter/cluster/xhpl | tee /install/mgt33/36node-linpack_${fm}.log
#
echo -e "***********************************************************************************************"
echo -e "\n\n\nTest log is store in the /install/tmp/linpack_${fm}.log\n\n\n"
#
#
[root@i01r05c01s01 cluster]#
[root@i01r05c01s01 cluster]# pwd
/root/peter/cluster
[root@i01r05c01s01 cluster]# bash n36.sh
[root@mgt33 HPL]# ssh i01r05c04s12
[root@i01r05c04s12 ~]# cd peter/cluster/
[root@i01r05c04s12 cluster]# cat hostfile
[root@i01r05c04s12 cluster]#bash n36.sh