I. Querying the ETCD Cluster
1. Define ETCD cluster variables
# Array of ETCD cluster IPs
export ETCD_IPS=(192.168.55.32 192.168.55.33 192.168.55.34)
export ETCD_NODE1=192.168.55.32
export ETCD_NODE2=192.168.55.33
export ETCD_NODE3=192.168.55.34
# List of ETCD client endpoints; note: replace the IPs with the addresses of your own ETCD servers
export ETCD_ENDPOINTS="https://192.168.55.32:2379,https://192.168.55.33:2379,https://192.168.55.34:2379"
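The etcdctl commands below repeat the same three TLS flags. As an optional convenience (a sketch of ours, not part of the original setup), they can be wrapped in a small shell function:

# Hypothetical helper: bundles the repeated TLS flags; adjust the paths to your cluster
etcdctl_tls() {
  etcdctl \
    --cacert=/etc/kubernetes/pki/ca.pem \
    --cert=/etc/kubernetes/pki/etcd.pem \
    --key=/etc/kubernetes/pki/etcd-key.pem "$@"
}
# Usage example:
# etcdctl_tls --endpoints=${ETCD_ENDPOINTS} endpoint health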
2. Check ETCD cluster health
for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  etcdctl \
    --endpoints=https://${node_ip}:2379 \
    --cacert=/etc/kubernetes/pki/ca.pem \
    --cert=/etc/kubernetes/pki/etcd.pem \
    --key=/etc/kubernetes/pki/etcd-key.pem \
    endpoint health
done
3. Get ETCD version information
for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  etcdctl \
    --endpoints=https://${node_ip}:2379 \
    --cacert=/etc/kubernetes/pki/ca.pem \
    --cert=/etc/kubernetes/pki/etcd.pem \
    --key=/etc/kubernetes/pki/etcd-key.pem \
    version
done
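Note that etcdctl version reports the client-side version. To read the server's version, the etcd /version HTTP endpoint can be queried directly; a sketch reusing the same certificates (it assumes curl is available on the node):

curl --cacert /etc/kubernetes/pki/ca.pem \
     --cert /etc/kubernetes/pki/etcd.pem \
     --key /etc/kubernetes/pki/etcd-key.pem \
     https://${ETCD_NODE1}:2379/version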
II. Backing Up ETCD Data
- Note: the etcdctl commands differ between ETCD versions, but only slightly. This article backs up with the snapshot save command, run on the ETCD cluster's leader node:
1. Find the ETCD cluster leader
# Note: change the certificate paths to those used by your own cluster
etcdctl -w table \
  --cacert=/etc/kubernetes/pki/ca.pem \
  --cert=/etc/kubernetes/pki/etcd.pem \
  --key=/etc/kubernetes/pki/etcd-key.pem \
  --endpoints=${ETCD_ENDPOINTS} endpoint status
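The leader is the endpoint whose IS LEADER column reads true. To pick it out non-interactively, one sketch (assuming jq is installed, which the original setup does not require) is:

# Print only the leader endpoint: the member whose own ID equals the reported leader ID
etcdctl -w json \
  --cacert=/etc/kubernetes/pki/ca.pem \
  --cert=/etc/kubernetes/pki/etcd.pem \
  --key=/etc/kubernetes/pki/etcd-key.pem \
  --endpoints=${ETCD_ENDPOINTS} endpoint status \
  | jq -r '.[] | select(.Status.header.member_id == .Status.leader) | .Endpoint'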
2. Run the backup on the leader node
mkdir -p /data/etcd_backup_dir
etcdctl \
  --cacert=/etc/kubernetes/pki/ca.pem \
  --cert=/etc/kubernetes/pki/etcd.pem \
  --key=/etc/kubernetes/pki/etcd-key.pem \
  --endpoints=https://192.168.55.32:2379 \
  snapshot save /data/etcd_backup_dir/etcd-snapshot-`date +%Y%m%d`.db
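Before relying on a snapshot, it is worth checking that the file is readable; a sketch using snapshot status (present in etcd v3 etcdctl, newer releases also offer etcdutl for the same purpose):

# Inspect the snapshot: prints its hash, revision, total keys, and size
etcdctl snapshot status /data/etcd_backup_dir/etcd-snapshot-`date +%Y%m%d`.db -w table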
III. Restoring ETCD Data
1. Preparation
- Stop the kube-apiserver service on all master nodes
# Stop kube-apiserver
systemctl stop kube-apiserver
# Confirm that the kube-apiserver service has stopped
ps -ef | grep kube-apiserver
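The command above is run per master. A sketch that stops it on all masters in one pass, assuming (as elsewhere in this article) that the masters are the same hosts as the ETCD nodes and are reachable as root over ssh:

for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  ssh root@${node_ip} "systemctl stop kube-apiserver"
done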
- Stop the etcd service on every node in the cluster
for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  ssh root@${node_ip} "systemctl stop etcd"
done
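Optionally, confirm that etcd is no longer active anywhere before touching the data directories; a quick sketch:

for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  # systemctl is-active prints "inactive" (or "failed") once the unit has stopped
  ssh root@${node_ip} "systemctl is-active etcd" || true
done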
- Move the existing data and WAL directories out of the way on every ETCD node
- Adjust the paths to the storage directories configured for your own ETCD service (a per-node loop sketch follows the command below)
mkdir -p /data/etcd_backup_dir
mv /data/var/lib/etcd/ /data/etcd_backup_dir/etcd_data_`date +%Y%m%d%H%M%S`
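This must be done on each ETCD node. A sketch that performs the move everywhere over ssh (note the escaped \$ so the timestamp expands on the remote host):

for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  ssh root@${node_ip} "mkdir -p /data/etcd_backup_dir && mv /data/var/lib/etcd/ /data/etcd_backup_dir/etcd_data_\$(date +%Y%m%d%H%M%S)"
done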
- Copy the ETCD backup snapshot
# Copy the backup from the ETCD cluster's leader node to the other two ETCD nodes
scp /data/etcd_backup_dir/etcd-snapshot-20210818.db root@192.168.55.33:/data/etcd_backup_dir/
scp /data/etcd_backup_dir/etcd-snapshot-20210818.db root@192.168.55.34:/data/etcd_backup_dir/
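To rule out a corrupted copy, the checksums can be compared across the three nodes; a sketch assuming sha256sum is available (all three hashes should match):

for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  ssh root@${node_ip} "sha256sum /data/etcd_backup_dir/etcd-snapshot-20210818.db"
done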
2. Restore the backup
- Define ETCD_NODES (run on each of the three master nodes)
# Peer IPs and ports for communication within the ETCD cluster; note: change the names here to those configured for your actual ETCD service
export ETCD_NODES="etcd0=https://192.168.55.32:2380,etcd1=https://192.168.55.33:2380,etcd2=https://192.168.55.34:2380"
export ETCD_NODE1=192.168.55.32
export ETCD_NODE2=192.168.55.33
export ETCD_NODE3=192.168.55.34
- Restore on master1 (data-dir and wal-dir should follow your ETCD service configuration)
# Run on 192.168.55.32
ETCDCTL_API=3 etcdctl snapshot restore /data/etcd_backup_dir/etcd-snapshot-20210818.db \
  --name etcd0 \
  --initial-cluster "${ETCD_NODES}" \
  --initial-cluster-token etcd-k8s-cluster \
  --initial-advertise-peer-urls https://${ETCD_NODE1}:2380 \
  --data-dir=/data/var/lib/etcd --wal-dir=/data/var/lib/etcd/wal
- Restore on master2 (data-dir and wal-dir should follow your ETCD service configuration)
# Run on 192.168.55.33
ETCDCTL_API=3 etcdctl snapshot restore /data/etcd_backup_dir/etcd-snapshot-20210818.db \
  --name etcd1 \
  --initial-cluster "${ETCD_NODES}" \
  --initial-cluster-token etcd-k8s-cluster \
  --initial-advertise-peer-urls https://${ETCD_NODE2}:2380 \
  --data-dir=/data/var/lib/etcd --wal-dir=/data/var/lib/etcd/wal
- Restore on master3 (data-dir and wal-dir should follow your ETCD service configuration)
# Run on 192.168.55.34
ETCDCTL_API=3 etcdctl snapshot restore /data/etcd_backup_dir/etcd-snapshot-20210818.db \
  --name etcd2 \
  --initial-cluster "${ETCD_NODES}" \
  --initial-cluster-token etcd-k8s-cluster \
  --initial-advertise-peer-urls https://${ETCD_NODE3}:2380 \
  --data-dir=/data/var/lib/etcd --wal-dir=/data/var/lib/etcd/wal
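The three commands differ only in --name and the advertised peer URL. As a compact alternative, the restore can be driven from one host; a sketch, assuming the name etcdN maps to the Nth entry of ETCD_IPS and that the snapshot already sits on every node:

i=0
for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  ssh root@${node_ip} "ETCDCTL_API=3 etcdctl snapshot restore /data/etcd_backup_dir/etcd-snapshot-20210818.db \
    --name etcd${i} \
    --initial-cluster '${ETCD_NODES}' \
    --initial-cluster-token etcd-k8s-cluster \
    --initial-advertise-peer-urls https://${node_ip}:2380 \
    --data-dir=/data/var/lib/etcd --wal-dir=/data/var/lib/etcd/wal"
  i=$((i+1))
done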
3. Start ETCD
After all three ETCD nodes have been restored, start ETCD:
for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  ssh root@${node_ip} "chmod 0700 /data/var/lib/etcd"
  ssh root@${node_ip} "systemctl daemon-reload && systemctl enable etcd && systemctl restart etcd"
done
4. Check ETCD
for node_ip in ${ETCD_IPS[@]}
do
  echo ">> ${node_ip}"
  etcdctl \
    --endpoints=https://${node_ip}:2379 \
    --cacert=/etc/kubernetes/pki/ca.pem \
    --cert=/etc/kubernetes/pki/etcd.pem \
    --key=/etc/kubernetes/pki/etcd-key.pem \
    endpoint health
done
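Beyond per-endpoint health, it is worth confirming that the restored cluster has elected a leader; the endpoint status command from section II shows this in its IS LEADER column:

etcdctl -w table \
  --cacert=/etc/kubernetes/pki/ca.pem \
  --cert=/etc/kubernetes/pki/etcd.pem \
  --key=/etc/kubernetes/pki/etcd-key.pem \
  --endpoints=${ETCD_ENDPOINTS} endpoint status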
5. Start the apiserver
- Once all three ETCD nodes are healthy, start kube-apiserver
for node_ip in ${ETCD_IPS[@]}
do
  echo ">>> ${node_ip}"
  ssh root@${node_ip} "systemctl daemon-reload && systemctl enable kube-apiserver && systemctl restart kube-apiserver"
done
6. Check the cluster
kubectl get cs
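kubectl get cs (componentstatuses) has been deprecated since Kubernetes v1.19; on newer clusters the apiserver's health endpoints are a more reliable check:

# Overall readiness of the apiserver, with per-check detail
kubectl get --raw='/readyz?verbose'
# And confirm the nodes come back Ready
kubectl get nodes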
IV. Creating a Backup CronJob
Save the following manifest as etcd-bak.yaml:
apiVersion: batch/v1beta1   # use batch/v1 on Kubernetes 1.21+
kind: CronJob
metadata:
  name: etcd-bak
spec:
  schedule: "0 */3 * * *"
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  # activeDeadlineSeconds: 60
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: etcd-bak
        spec:
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                - matchExpressions:
                  - key: node-role.kubernetes.io/master
                    operator: In
                    values:
                    - ""
          # assumption: masters keep the default NoSchedule taint; drop this if yours do not
          tolerations:
          - key: node-role.kubernetes.io/master
            operator: Exists
            effect: NoSchedule
          containers:
          - name: etcd
            image: registry.aliyuncs.com/google_containers/etcd:3.5.1-0
            command:
            - sh
            - -c
            - "for ETCD_IP in $ETCD_IPS; do export ETCDCTL_API=3; \
              etcdctl --cert=/etc/kubernetes/pki/etcd/server.crt \
              --cacert=/etc/kubernetes/pki/etcd/ca.crt \
              --key=/etc/kubernetes/pki/etcd/server.key \
              --endpoints https://$ETCD_IP:2379 \
              snapshot save /tmp/etcd-snapshot-$ETCD_IP-$(date +%Y%m%d-%H%M%S).db; \
              echo etcd backup success; done"
            env:
            - name: ETCD_IPS
              value: "10.0.16.14"
            volumeMounts:
            - mountPath: "/tmp"
              name: snapshot
              # subPath: data/etcd-snapshot
            - mountPath: /etc/localtime
              name: lt-config
            - mountPath: /etc/kubernetes/pki/etcd
              name: etcd-secret
          restartPolicy: OnFailure
          volumes:
          - name: snapshot
            hostPath:
              path: /data/etcd_backup_dir
          - name: lt-config
            hostPath:
              path: /etc/localtime
          - name: etcd-secret
            hostPath:
              path: /etc/kubernetes/pki/etcd
          hostNetwork: true
Change the ETCD_IPS value in the manifest to the IPs of your master nodes, e.g. "10.0.16.14 10.0.16.15 10.0.16.16", then create the CronJob:
kubectl apply -f etcd-bak.yaml
Once applied, this creates a scheduled job that, every 3 hours, writes a backup file named etcd-snapshot-<node IP>-<timestamp>.db into the /data/etcd_backup_dir directory on the master node.
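To verify the schedule, or to take an ad-hoc backup without waiting for it, the CronJob can be triggered manually; a sketch (the job name etcd-bak-manual is arbitrary):

# Check that the CronJob was created and when it last ran
kubectl get cronjob etcd-bak
# Trigger a one-off run from the CronJob template
kubectl create job --from=cronjob/etcd-bak etcd-bak-manual
# Then list the snapshots on the master node
ls -lh /data/etcd_backup_dir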
V. Restoring from CronJob Backups
When ETCD fails, pick the most recent backup and restore from it. The procedure is exactly the steps of section III above; simply substitute the latest CronJob snapshot (etcd-snapshot-<node IP>-<timestamp>.db under /data/etcd_backup_dir) for the snapshot file used there.
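A small sketch for picking the newest snapshot on the node (it assumes the backup directory from the CronJob above):

# Most recently modified snapshot file
LATEST=$(ls -t /data/etcd_backup_dir/etcd-snapshot-*.db | head -n 1)
echo "restoring from ${LATEST}"
# Feed ${LATEST} to the snapshot restore commands from section III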