一、环境准备
二、MGR故障切换测试
2.1 primary节点mysql服务故障
关闭primary节点的mysql服务,模拟服务故障,验证该节点被驱除出集群后vip自动切换能力和业务自愈能力
2.1.1 查看VIP服务和MGR状态
[root@mgrserver01 HAIPMGR-master]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 00:0c:29:3f:54:04 brd ff:ff:ff:ff:ff:ff
inet 192.168.111.30/24 brd 192.168.111.255 scope global noprefixroute eth0
valid_lft forever preferred_lft forever
inet 192.168.111.33/24 brd 192.168.111.255 scope global secondary eth0:3307
valid_lft forever preferred_lft forever
root@localhost: 14:23: [(none)]> SELECT * FROM performance_schema.replication_group_members;
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
| CHANNEL_NAME | MEMBER_ID | MEMBER_HOST | MEMBER_PORT | MEMBER_STATE | MEMBER_ROLE | MEMBER_VERSION | MEMBER_COMMUNICATION_STACK |
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
| group_replication_applier | 2c7bd415-c173-11ee-a4d1-000c293f5404 | mgrserver01 | 3307 | ONLINE | PRIMARY | 8.0.35 | XCom |
| group_replication_applier | 33e4f69b-c173-11ee-9a5d-000c29f132d9 | mgrserver03 | 3307 | ONLINE | SECONDARY | 8.0.35 | XCom |
| group_replication_applier | 3fc65174-c173-11ee-94aa-000c29c1073f | mgrserver02 | 3307 | ONLINE | SECONDARY | 8.0.35 | XCom |
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
3 rows in set (0.03 sec)
2.1.2 mgrmaster01停数据库服务
[root@mgrserver01 HAIPMGR-master]# /data/mysql8.0.35/install/mysql-8.0.35/bin/mysqladmin --defaults-file=/data/mysql8.0.35/3307/conf/my.cnf -uroot -p shutdown
2.1.3 再次VIP服务和MGR状态
[root@mgrserver01 HAIPMGR-master]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 00:0c:29:3f:54:04 brd ff:ff:ff:ff:ff:ff
inet 192.168.111.30/24 brd 192.168.111.255 scope global noprefixroute eth0
valid_lft forever preferred_lft forever
mgrserver03成为主库
root@localhost: 14:25: [(none)]> SELECT * FROM performance_schema.replication_group_members;
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
| CHANNEL_NAME | MEMBER_ID | MEMBER_HOST | MEMBER_PORT | MEMBER_STATE | MEMBER_ROLE | MEMBER_VERSION | MEMBER_COMMUNICATION_STACK |
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
| group_replication_applier | 33e4f69b-c173-11ee-9a5d-000c29f132d9 | mgrserver03 | 3307 | ONLINE | PRIMARY | 8.0.35 | XCom |
| group_replication_applier | 3fc65174-c173-11ee-94aa-000c29c1073f | mgrserver02 | 3307 | ONLINE | SECONDARY | 8.0.35 | XCom |
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
[root@mgrserver03 HAIPMGR-master]# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
link/ether 00:0c:29:f1:32:d9 brd ff:ff:ff:ff:ff:ff
inet 192.168.111.32/24 brd 192.168.111.255 scope global noprefixroute eth0
valid_lft forever preferred_lft forever
inet 192.168.111.33/24 brd 192.168.111.255 scope global secondary eth0:3307
valid_lft forever preferred_lft forever
2.1.4 将mgrserver01数据库服务拉起并加入集群
[root@mgrserver01 HAIPMGR-master]# nohup /data/mysql8.0.35/install/mysql-8.0.35/bin/mysqld_safe --defaults-file=/data/mysql8.0.35/3307/conf/my.cnf &
[root@mgrserver01 HAIPMGR-master]# /data/mysql8.0.35/install/mysql-8.0.35/bin/mysql --defaults-file=/data/mysql8.0.35/3307/conf/my.cnf -uroot -p'r2_#y)mg&OV3'
mysql: [Warning] Using a password on the command line interface can be insecure.
Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 44
Server version: 8.0.35 MySQL Community Server - GPL
Copyright (c) 2000, 2023, Oracle and/or its affiliates.
Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.
Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
root@localhost: 15:01: [(none)]> START GROUP_REPLICATION;
Query OK, 0 rows affected, 1 warning (8.18 sec)
root@localhost: 15:02: [(none)]> SELECT * FROM performance_schema.replication_group_members;
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
| CHANNEL_NAME | MEMBER_ID | MEMBER_HOST | MEMBER_PORT | MEMBER_STATE | MEMBER_ROLE | MEMBER_VERSION | MEMBER_COMMUNICATION_STACK |
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
| group_replication_applier | 2c7bd415-c173-11ee-a4d1-000c293f5404 | mgrserver01 | 3307 | ONLINE | SECONDARY | 8.0.35 | XCom |
| group_replication_applier | 33e4f69b-c173-11ee-9a5d-000c29f132d9 | mgrserver03 | 3307 | ONLINE | PRIMARY | 8.0.35 | XCom |
| group_replication_applier | 3fc65174-c173-11ee-94aa-000c29c1073f | mgrserver02 | 3307 | ONLINE | SECONDARY | 8.0.35 | XCom |
+---------------------------+--------------------------------------+-------------+-------------+--------------+-------------+----------------+----------------------------+
3 rows in set (0.00 sec)
2.2 primary节点系统故障
执行reboot操作,模拟系统故障后,vip是否自动切换,服务启动后,节点重新加入集群是否正常
2.3 secondary节点故障
同上,模拟secondary节点系统故障和mysql服务故障后的vip自动切换能力,以及节点是否可以正常加入集群
三、逃生方案
3.1 背景
为了避免网络异常造成集群不可用,或者重大故障导致集群异常时,DBA能够快速回复业务的可用性。
以下是逃生方案的一些思路
3.2 尝试重组MGR
启动mgr实例,检查3个节点的gtid集合。
3.2.1 选择gtid最大的节点,执行下面操作
set global group_replication_bootstrap_group=on;
start group_replication;
set global group_replication_bootstrap_group=off;
select * from performance_schema.replication_group_members;
3.2.2 另外两个节点执行以下操作
start group_replication;
select * from performance_schema.replication_group_members;
3.3 单实例模式运行
重组mgr失败后,选择GTID集最大的节点,关闭只读模式,手动挂载VIP,然后提供服务。
3.4 数据恢复
如果mysql服务无法正常启动时,考虑备份恢复。