计算节点要想发现cellnode的磁盘,需要在计算节点配置两个文件,所有的计算节点都需要配置
计算节点上的配置文件的子网掩码要和存储节点上的配置文件的子网掩码保持一致
[root@slcz01db03 network-config]# pwd
/etc/oracle/cell/network-config
cellinit.ora里面记录的是计算节点上和cellnode连接的infiniband网卡的ip地址,共4个或者8个
不要都是ipaddress1,要分别为ipaddress1、2、3、4
错误的写法:
ipaddress1=192.168.64.190/21
ipaddress1=192.168.64.191/21
ipaddress1=192.168.64.192/21
ipaddress1=192.168.64.193/21
[root@slcz01db03 network-config]# cat cellinit.ora
ipaddress1=192.168.64.190/21
ipaddress2=192.168.64.191/21
ipaddress3=192.168.64.192/21
ipaddress4=192.168.64.193/21
#ipaddress2=192.168.64.101/21
#ipaddress3=192.168.64.102/21
#ipaddress4=192.168.64.103/21
_ipcdat_device_list="bondib0"
#_ipcdat_device_list="bondib0,bondib1,bondib2,bondib3"
#_cell_disable_ipcdat_on_client=true
#_cell_enable_ipcdat_on_client=true
cellip.ora记录的是cellnode的ip地址,此处有两个cell node
[root@slcz01db03 network-config]# cat cellip.ora
cell="192.168.64.182"
cell="192.168.64.183"
#cell="192.168.64.184"
[root@scaqai06adm08 network-config]# cat cellinit.ora
_cell_enable_ipcdat_srq=false
#ipaddress2=192.168.41.222/20
_cell_disable_ipcdat_on_client=true
#ipaddress1=192.168.41.221/20
ipaddress1=192.168.41.221/21
ipaddress2=192.168.41.222/21
[root@scaqai06adm08 network-config]# cat cellip.ora
cell="192.168.41.245;192.168.41.246"
cell="192.168.41.247;192.168.41.248"
cell="192.168.41.249;192.168.41.250"
对于computenode上的ib卡使用或者不使用bond的情况
使用bond的情况
[root@slcm05adm01 network-config]# ifconfig -a | grep ib[0-9] -A 1
bondib0: flags=5187<UP,BROADCAST,RUNNING,MASTER,MULTICAST> mtu 65520
inet 192.168.0.1 netmask 255.255.240.0 broadcast 192.168.15.255
--
ib0: flags=6211<UP,BROADCAST,RUNNING,SLAVE,MULTICAST> mtu 65520
infiniband 80:00:02:08:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
--
ib1: flags=6211<UP,BROADCAST,RUNNING,SLAVE,MULTICAST> mtu 65520
infiniband 80:00:02:09:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
[root@slcm05adm01 network-config]# pwd
/etc/oracle/cell/network-config
[root@slcm05adm01 network-config]# cat cellinit.ora
ipaddress1=192.168.0.1/20
#####################################################
不使用bond的情况
[root@scaqai06adm07 network-config]# ifconfig -a | grep ib[0-9] -A 1
ib0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 65520
inet 192.168.1.57 netmask 255.255.240.0 broadcast 192.168.15.255
--
ib1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 65520
inet 192.168.1.58 netmask 255.255.240.0 broadcast 192.168.15.255
[root@scaqai06adm07 cell]# cd network-config/
[root@scaqai06adm07 network-config]# ls
cellinit.ora cellip.ora
[root@scaqai06adm07 network-config]# cat cellinit.ora
ipaddress1=192.168.1.57/20
ipaddress2=192.168.1.58/20
对于cellnode上使用bond和不使用bond的情况下computenode的cellip.ora
使用bond的情况
[root@slcm05celadm01 ~]# ifconfig -a | grep ib[0-9] -A 1
bondib0: flags=5187<UP,BROADCAST,RUNNING,MASTER,MULTICAST> mtu 1500
inet 192.168.0.17 netmask 255.255.240.0 broadcast 192.168.15.255
--
ib0: flags=6211<UP,BROADCAST,RUNNING,SLAVE,MULTICAST> mtu 1500
infiniband 80:00:02:08:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
--
ib1: flags=6211<UP,BROADCAST,RUNNING,SLAVE,MULTICAST> mtu 1500
infiniband 80:00:02:09:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
[root@slcm05adm01 network-config]# pwd
/etc/oracle/cell/network-config
[root@slcm05adm01 network-config]# cat cellip.ora
cell="192.168.0.17"
cell="192.168.0.19"
cell="192.168.0.21"
cell="192.168.0.23"
cell="192.168.0.25"
cell="192.168.0.27"
cell="192.168.0.29"
######################################
不使用bond的情况
[root@scaqai06celadm14 ~]# ifconfig -a | grep ib[0-9] -A 1
ib0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 2044
inet 192.168.1.87 netmask 255.255.240.0 broadcast 192.168.15.255
--
ib1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 2044
inet 192.168.1.88 netmask 255.255.240.0 broadcast 192.168.15.255
[root@scaqai06adm08 network-config]# cat cellip.ora
cell="192.168.1.83;192.168.1.84"
cell="192.168.1.85;192.168.1.86"
cell="192.168.1.87;192.168.1.88"
[root@scaqai06adm08 network-config]# pwd
/etc/oracle/cell/network-config
对于cellnode上使用bond和不使用bond的情况下cellnode的cellinit.ora
不使用bond的情况
[root@scaqai06celadm14 ~]# ifconfig -a | grep ib[0-9] -A 1
ib0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 2044
inet 192.168.1.87 netmask 255.255.240.0 broadcast 192.168.15.255
--
ib1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 2044
inet 192.168.1.88 netmask 255.255.240.0 broadcast 192.168.15.255
[root@scaqai06celadm14 ~]# imageinfo | grep -i '^Active image version'
Active image version: 21.2.11.0.0.220414.1
要选对目录,image应该就是当前active的image
[root@scaqai06celadm14 config]# pwd
/opt/oracle/cell/cellofl-21.2.11.0.0_LINUX.X64_220414.1/cellsrv/deploy/config
[root@scaqai06celadm14 config]# cat cellinit.ora
#CELL Initialization Parameters
_cell_ramcache_mode=On
ipaddress2=192.168.1.88/20
ipaddress1=192.168.1.87/20
使用bond的情况
#############################################################
[root@slcm05celadm01 config]# ifconfig -a | grep -i ib[0-9] -A 1
bondib0: flags=5187<UP,BROADCAST,RUNNING,MASTER,MULTICAST> mtu 1500
inet 192.168.0.17 netmask 255.255.240.0 broadcast 192.168.15.255
--
ib0: flags=6211<UP,BROADCAST,RUNNING,SLAVE,MULTICAST> mtu 1500
infiniband 80:00:02:08:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
--
ib1: flags=6211<UP,BROADCAST,RUNNING,SLAVE,MULTICAST> mtu 1500
infiniband 80:00:02:09:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
[root@slcm05celadm01 cell]# imageinfo | grep -i '^Active image version'
Active image version: 22.1.90.0.0.220331
注意配置文件的路径一定要找对,一定是当前active的image version
[root@slcm05celadm01 config]# pwd
/opt/oracle/cell/cellofl-22.1.90.0.0_LINUX.X64_220331/cellsrv/deploy/config
[root@slcm05celadm01 config]# cat cellinit.ora
#CELL Initialization Parameters
_cell_ramcache_mode=On
ipaddress1=192.168.0.17/20
cell节点上的配置文件,cellinit.ora里面记录的是cell node的ib卡的ip地址
[root@scaqad02celadm01 config]# pwd
/opt/oracle/cell20.2.0.0.0_LINUX.X64_200810/cellsrv/deploy/config
[root@scaqad02celadm01 config]# cat cellinit.ora
#CELL Initialization Parameters
_cell_ramcache_mode=On
ipaddress2=192.168.0.18/20
ipaddress1=192.168.0.17/20
_cell_fc_persistence_state=WriteBack
#_cell_enable_buffer_hist = true
#_cell_server_event="trace[CELLSRV_Disk_layer.*] disk=highest, memory=highest"
#_cell_server_event="trace[CELL_Block_Server.*] memory=highest"
#_cell_server_event="trace[CELLSRV_IO_Layer.*] memory=highest"
#_cell_server_event="trace[cellsrv_disk_layer.*] memory=highest"
#_cell_server_event="trace[CELL_Block_Server.*] memory=medium"
#_cell_server_event="trace[CELLSRV_IO_Layer.*] memory=highest"
#_cell_server_event="trace[cellsrv_disk_layer.*] memory=highest"
#_cell_server_event="trace[cellsrv_flash_cache_layer.*] memory=highest"
[root@scaqad02celadm01 config]# ifconfig -a | grep ib -A 1
Infiniband hardware address can be incorrect! Please read BUGS section in ifconfig(8).
Infiniband hardware address can be incorrect! Please read BUGS section in ifconfig(8).
ib0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 2044
inet 192.168.0.17 netmask 255.255.240.0 broadcast 192.168.15.255
infiniband 80:00:02:08:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
RX packets 920832 bytes 125597136 (119.7 MiB)
--
ib1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 2044
inet 192.168.0.18 netmask 255.255.240.0 broadcast 192.168.15.255
infiniband 80:00:02:09:FE:80:00:00:00:00:00:00:00:00:00:00:00:00:00:00 txqueuelen 256 (InfiniBand)
RX packets 500222 bytes 31498120 (30.0 MiB)
然后使用kfod.bin来查看是否能找到磁盘,kfod.bin可以在没有安装gi的情况下运行
[root@slcz01db03 bin]# pwd
/u01/u01/app/12.1.0/grid/bin
[root@slcz01db03 bin]# set | grep ORACLE_HOME
ORACLE_HOME=/u01/u01/app/12.1.0/grid
[root@slcz01db03 bin]# export LD_LIBRARY_PATH=/u01/u01/app/12.1.0/grid/lib
[crsusr@slcz01db03 bin]$ ./kfod.bin
Error 1 initializing CRS infrastructure
--------------------------------------------------------------------------------
Disk Size Path User Group
================================================================================
1: 409600 MB o/192.168.64.182/dwgz_CD_00_slcc04cel07
2: 409600 MB o/192.168.64.182/dwgz_CD_01_slcc04cel07
3: 409600 MB o/192.168.64.182/dwgz_CD_02_slcc04cel07
4: 409600 MB o/192.168.64.182/dwgz_CD_03_slcc04cel07
5: 409600 MB o/192.168.64.182/dwgz_CD_04_slcc04cel07
6: 409600 MB o/192.168.64.182/dwgz_CD_05_slcc04cel07
7: 409600 MB o/192.168.64.182/dwgz_CD_06_slcc04cel07
8: 409600 MB o/192.168.64.182/dwgz_CD_07_slcc04cel07
9: 409600 MB o/192.168.64.182/dwgz_CD_08_slcc04cel07
10: 409600 MB o/192.168.64.182/dwgz_CD_09_slcc04cel07
11: 409600 MB o/192.168.64.182/dwgz_CD_10_slcc04cel07
12: 409600 MB o/192.168.64.182/dwgz_CD_11_slcc04cel07
13: 409600 MB o/192.168.64.183/dwgz_CD_00_slcc04cel08
14: 409600 MB o/192.168.64.183/dwgz_CD_01_slcc04cel08
15: 409600 MB o/192.168.64.183/dwgz_CD_02_slcc04cel08
16: 409600 MB o/192.168.64.183/dwgz_CD_03_slcc04cel08
17: 409600 MB o/192.168.64.183/dwgz_CD_04_slcc04cel08
18: 409600 MB o/192.168.64.183/dwgz_CD_05_slcc04cel08
19: 409600 MB o/192.168.64.183/dwgz_CD_06_slcc04cel08
20: 409600 MB o/192.168.64.183/dwgz_CD_07_slcc04cel08
21: 409600 MB o/192.168.64.183/dwgz_CD_08_slcc04cel08
22: 409600 MB o/192.168.64.183/dwgz_CD_09_slcc04cel08
23: 409600 MB o/192.168.64.183/dwgz_CD_10_slcc04cel08
24: 409600 MB o/192.168.64.183/dwgz_CD_11_slcc04cel08
KFOD-00301: Unable to contact Cluster Synchronization Services (CSS). Return code 2 from kgxgncin.
上面这行不是错误,是因为没有安装gi
[root@scaqai06adm08 tyl]# rpm -ivh exadata-dbmmgmt-20.2.0.0.0.200505-1.noarch.rpm
Preparing... ################################# [100%]
2020-07-19 04:17:25 -0700: Pre Installation steps in progress ...
2020-07-19 04:17:27 -0700: This is a fresh install.
Updating / installing...
1:exadata-dbmmgmt-20.2.0.0.0.200505################################# [100%]
2020-07-19 04:17:35 -0700: Post Installation steps in progress ...
Starting MS...
Importing snmp suscriber from compmon service...
Successfully imported snmp subscribers.
Installation SUCCESSFUL.
Done. Please Login as user dbmadmin.
在清环境的时候,由于不知道exadata-dbmmgmt-20.2.0.0.0.200505-1.noarch.rpm这个rpm是干什么用的,原来以为只是exascale用,就用 rpm -e 将之删除了
导致出现下面的错误
./kfod.bin
Error 1 initializing CRS infrastructure
KFOD-00302: Error encountered in device access layer: OSS Operation oss_initialize failed with error 149 [Unable to load lib ISAL]
最近刚碰到的另外一个导致compute node无法发现griddisk的原因,应该是exascale残留的配置文件导致的,删除exascale配置文件就解决了
[root@scaqai06adm08 bin]# ./kfod.bin disks=all
--------------------------------------------------------------------------------
ORACLE_SID ORACLE_HOME
================================================================================
需要删除/etc/oracle/cell/network-config/下的和exascale相关的配置文件
[root@scaqai06adm08 network-config]# ls
cellinit.ora cellip.ora cellroute.ora egsip.ora egsip.ora-orig eswallet
[root@scaqai06adm08 network-config]# rm -rf egs*
[root@scaqai06adm08 network-config]# rm -rf eswallet/
[root@scaqai06adm08 network-config]# ls
cellinit.ora cellip.ora cellroute.ora
不知道为什么cellroute.ora 文件总是被清空?
[root@scaqai06adm08 network-config]# cat cellroute.ora
# Routes for 192.168.41.245;192.168.41.246
route="192.168.41.245;192.168.41.221"
route="192.168.41.246;192.168.41.222"
# Routes for 192.168.41.247;192.168.41.248
route="192.168.41.247;192.168.41.221"
route="192.168.41.248;192.168.41.222"
# Routes for 192.168.41.249;192.168.41.250
route="192.168.41.249;192.168.41.221"
route="192.168.41.250;192.168.41.222"