ETCD Data Backup and Restore

I. Querying the ETCD Cluster

1. Define ETCD cluster variables

# Array of ETCD cluster IPs
export ETCD_IPS=(192.168.55.32 192.168.55.33 192.168.55.34)
export ETCD_NODE1=192.168.55.32
export ETCD_NODE2=192.168.55.33
export ETCD_NODE3=192.168.55.34
 
 
# ETCD client endpoint list; replace the IPs with the addresses of your own ETCD servers
export ETCD_ENDPOINTS="https://192.168.55.32:2379,https://192.168.55.33:2379,https://192.168.55.34:2379"

2. Check ETCD cluster health

for node_ip in ${ETCD_IPS[@]}
  do
    echo ">> ${node_ip}"
    etcdctl \
    --endpoints=https://${node_ip}:2379 \
    --cacert=/etc/kubernetes/pki/ca.pem \
    --cert=/etc/kubernetes/pki/etcd.pem \
    --key=/etc/kubernetes/pki/etcd-key.pem endpoint health
  done

3. Get the ETCD version

for node_ip in ${ETCD_IPS[@]}
  do
    echo ">> ${node_ip}"
    etcdctl \
    --endpoints=https://${node_ip}:2379 \
    --cacert=/etc/kubernetes/pki/ca.pem \
    --cert=/etc/kubernetes/pki/etcd.pem \
    --key=/etc/kubernetes/pki/etcd-key.pem version
  done

II. ETCD Data Backup

  • Note: the etcdctl command syntax differs somewhat between ETCD versions, but the steps are broadly the same. This article uses the snapshot save command and runs the backup on the ETCD cluster's leader node (a quick check of the client API version follows below):
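Because snapshot save is a v3 API command, it is worth confirming that etcdctl is using the v3 API before taking a backup; a minimal check (on etcd 3.3 and earlier the v2 API is the default, so it must be set explicitly):

# Confirm etcdctl uses the v3 API before running snapshot save
export ETCDCTL_API=3
etcdctl version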

1. Identify the ETCD cluster leader

# Adjust the certificate paths to match your own cluster
etcdctl \
  -w table --cacert=/etc/kubernetes/pki/ca.pem \
  --cert=/etc/kubernetes/pki/etcd.pem \
  --key=/etc/kubernetes/pki/etcd-key.pem \
  --endpoints=${ETCD_ENDPOINTS} endpoint status
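The table output includes an IS LEADER column, so the leader can be read directly from it. Alternatively, assuming jq is installed, the leader endpoint can be extracted from the JSON output; a sketch:

# Print only the leader endpoint (requires jq)
etcdctl -w json \
  --cacert=/etc/kubernetes/pki/ca.pem \
  --cert=/etc/kubernetes/pki/etcd.pem \
  --key=/etc/kubernetes/pki/etcd-key.pem \
  --endpoints=${ETCD_ENDPOINTS} endpoint status \
  | jq -r '.[] | select(.Status.leader == .Status.header.member_id) | .Endpoint'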

2. Run the backup on the leader node

mkdir -p /data/etcd_backup_dir
etcdctl \
  --cacert=/etc/kubernetes/pki/ca.pem \
  --cert=/etc/kubernetes/pki/etcd.pem \
  --key=/etc/kubernetes/pki/etcd-key.pem \
  --endpoints=https://192.168.55.32:2379 \
  snapshot save /data/etcd_backup_dir/etcd-snapshot-`date +%Y%m%d`.db
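After the backup completes, the snapshot file can optionally be verified; a minimal sketch (on etcd 3.5+ this sub-command has moved to etcdutl, but etcdctl still accepts it with a deprecation warning):

# Show the hash, revision, total keys and size of the snapshot file
ETCDCTL_API=3 etcdctl -w table snapshot status \
  /data/etcd_backup_dir/etcd-snapshot-`date +%Y%m%d`.db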

III. ETCD Data Restore

1. Preparation

  • Stop the kube-apiserver service on all master nodes
# Stop kube-apiserver
systemctl stop kube-apiserver

# Confirm that kube-apiserver has stopped
ps -ef | grep kube-apiserver
  • Stop the etcd service on every ETCD node
for node_ip in ${ETCD_IPS[@]}
  do
    echo ">> ${node_ip}"
    ssh root@${node_ip} "systemctl stop etcd"
  done
  • Remove the data and WAL directories under every ETCD node's storage path (a loop for doing this on all nodes at once is sketched after this preparation list)
  • Adjust the paths below to match the storage paths configured for your own etcd service
mkdir -p /data/etcd_backup_dir
mv /data/var/lib/etcd/ /data/etcd_backup_dir/etcd_data_`date +%Y%m%d%H%M%S`
  • Copy the ETCD backup snapshot
# Copy the backup from the ETCD cluster's leader node to the other two ETCD nodes
scp /data/etcd_backup_dir/etcd-snapshot-20210818.db root@192.168.55.33:/data/etcd_backup_dir/ 
 
scp /data/etcd_backup_dir/etcd-snapshot-20210818.db root@192.168.55.34:/data/etcd_backup_dir/
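The mv command in the list above only acts on the local node; a minimal sketch for moving the old data directory aside on every ETCD node in one pass (assuming passwordless SSH as root and the same data path on all nodes):

for node_ip in ${ETCD_IPS[@]}
  do
    echo ">> ${node_ip}"
    ssh root@${node_ip} "mkdir -p /data/etcd_backup_dir && \
      mv /data/var/lib/etcd/ /data/etcd_backup_dir/etcd_data_\$(date +%Y%m%d%H%M%S)"
  done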

2. Restore the backup

  • Define ETCD_NODES (run this on all three master nodes; see the lookup sketch after this block if you are unsure of the configured member names)
# Peer IPs and ports used for communication between ETCD members; change the member names to match the name configured for your own etcd service
export ETCD_NODES="etcd0=https://192.168.55.32:2380,etcd1=https://192.168.55.33:2380,etcd2=https://192.168.55.34:2380"


export ETCD_NODE1=192.168.55.32
export ETCD_NODE2=192.168.55.33
export ETCD_NODE3=192.168.55.34
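If you are not sure which member names your etcd service uses, they can be read from the running configuration; a minimal sketch, assuming etcd runs as a systemd unit (the config file path is an assumption and depends on how etcd was deployed):

# Read the member name and initial cluster list from the etcd unit
systemctl cat etcd | grep -E -- '--name|--initial-cluster'

# Or, if etcd is driven by an environment file (path is an assumption):
grep -E 'ETCD_NAME|ETCD_INITIAL_CLUSTER' /etc/etcd/etcd.conf 2>/dev/null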

  • Restore on master1 (data-dir and wal-dir should match your etcd service configuration)
# Run on 192.168.55.32
ETCDCTL_API=3 etcdctl snapshot restore /data/etcd_backup_dir/etcd-snapshot-20210818.db \
  --name etcd0 \
  --initial-cluster "${ETCD_NODES}" \
  --initial-cluster-token etcd-k8s-cluster \
  --initial-advertise-peer-urls https://${ETCD_NODE1}:2380 \
  --data-dir=/data/var/lib/etcd --wal-dir=/data/var/lib/etcd/wal

  • Restore on master2 (data-dir and wal-dir should match your etcd service configuration)
# Run on 192.168.55.33
ETCDCTL_API=3 etcdctl snapshot restore /data/etcd_backup_dir/etcd-snapshot-20210818.db \
  --name etcd1 \
  --initial-cluster "${ETCD_NODES}" \
  --initial-cluster-token etcd-k8s-cluster \
  --initial-advertise-peer-urls https://${ETCD_NODE2}:2380 \
  --data-dir=/data/var/lib/etcd --wal-dir=/data/var/lib/etcd/wal

  • Restore on master3 (data-dir and wal-dir should match your etcd service configuration)
# Run on 192.168.55.34
ETCDCTL_API=3 etcdctl snapshot restore /data/etcd_backup_dir/etcd-snapshot-20210818.db \
  --name etcd2 \
  --initial-cluster "${ETCD_NODES}" \
  --initial-cluster-token etcd-k8s-cluster \
  --initial-advertise-peer-urls https://${ETCD_NODE3}:2380 \
  --data-dir=/data/var/lib/etcd --wal-dir=/data/var/lib/etcd/wal

3. Start ETCD

After all three ETCD nodes have been restored, start etcd:

for node_ip in ${ETCD_IPS[@]}
  do
    echo ">> ${node_ip}"
    ssh root@${node_ip} "chmod 0700 /data/var/lib/etcd"
    ssh root@${node_ip} "systemctl daemon-reload && systemctl enable etcd && systemctl restart etcd"
  done

4. Verify ETCD

for node_ip in ${ETCD_IPS[@]}
  do
    echo ">> ${node_ip}"
    etcdctl \
    --endpoints=https://${node_ip}:2379 \
    --cacert=/etc/kubernetes/pki/ca.pem \
    --cert=/etc/kubernetes/pki/etcd.pem \
    --key=/etc/kubernetes/pki/etcd-key.pem endpoint health
  done

5. Start kube-apiserver

  • Once all three ETCD members report healthy, start kube-apiserver:
for node_ip in ${ETCD_IPS[@]}
  do
    echo ">>> ${node_ip}"
    ssh root@${node_ip} "systemctl daemon-reload && systemctl enable kube-apiserver && systemctl restart kube-apiserver"
  done

6. Check the cluster

kubectl get cs
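Note that componentstatuses (kubectl get cs) is deprecated on newer Kubernetes releases; as an alternative sketch, the apiserver readiness endpoint and node status can be checked instead:

# componentstatuses is deprecated on newer Kubernetes releases;
# the readiness endpoint and node list give an equivalent health check
kubectl get --raw='/readyz?verbose'
kubectl get nodes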

IV. Scheduled Backup with a CronJob

Save the following manifest as etcd-bak.yaml (it is applied in the next section):

apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: etcd-bak
spec:
 schedule: "0 */3 * * *"
 successfulJobsHistoryLimit: 3
 failedJobsHistoryLimit: 5
 #activeDeadlineSeconds: 60
 jobTemplate:
  spec:
    template:
      metadata:
       labels:
        app: etcd-bak
      spec:
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
                  nodeSelectorTerms:
                  - matchExpressions:
                    - key: node-role.kubernetes.io/master
                      operator: In
                      values:
                      - ""
        containers:
        - name: etcd
          image: registry.aliyuncs.com/google_containers/etcd:3.5.1-0
          command:
          - sh
          - -c
          - "for ETCD_IP in $ETCD_IPS;
            do
             export ETCDCTL_API=3; \
             etcdctl --cert=/etc/kubernetes/pki/etcd/server.crt \
             --cacert=/etc/kubernetes/pki/etcd/ca.crt \
             --key=/etc/kubernetes/pki/etcd/server.key \
             --endpoints $ETCD_IP:2379 snapshot save /tmp/etcd-snapshot-$ETCD_IP-$(printf '%(%Y%m%d-%H%M%S)T').db; \
             echo etcd backup success;
            done"
          env:
          - name: ETCD_IPS
            value: "10.0.16.14"
          volumeMounts:
            - mountPath: "/tmp"
              name: snapshot
        #      subPath: data/etcd-snapshot
            - mountPath: /etc/localtime
              name: lt-config
            - mountPath: /etc/kubernetes/pki/etcd
              name: etcd-secret
        restartPolicy: OnFailure
        volumes:
          - name: snapshot
            hostPath:
              path: /data/etcd_backup_dir
          - name: lt-config
            hostPath:
              path: /etc/localtime
          - name: etcd-secret
            hostPath:
              path: /etc/kubernetes/pki/etcd
        hostNetwork: true

V. Creating the CronJob

Change ETCD_IPS to the IP addresses of your master nodes, for example "10.0.16.14 10.0.16.15 10.0.16.16".

Create it:

kubectl apply -f etcd-bak.yaml

Once applied, the CronJob runs every 3 hours and writes a backup file named etcd-snapshot-<node IP>-<timestamp>.db to the /data/etcd_backup_dir directory on the master node.
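To confirm the schedule is working, the CronJob can be checked and, as a sketch, triggered once by hand (the job name etcd-bak-manual below is only an example):

# Check the CronJob and trigger a one-off test run
kubectl get cronjob etcd-bak
kubectl create job --from=cronjob/etcd-bak etcd-bak-manual
kubectl get pods -l app=etcd-bak

# The snapshot files land in the hostPath directory on the master node
ls -lh /data/etcd_backup_dir/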

VI. Restoring ETCD Data

If ETCD fails, pick the most recent backup and restore from it.

The restore procedure is identical to Section III (ETCD Data Restore) above: stop kube-apiserver on all masters and etcd on all nodes, move the old data and WAL directories aside, copy the chosen snapshot to every ETCD node, run etcdctl snapshot restore on each node with its own member name and peer URL, then start etcd, verify endpoint health, restart kube-apiserver, and check the cluster.
