Chaos engineering docs: https://chaosblade.io/docs/
helm: https://github.com/helm/helm/releases
chaosblade: https://github.com/chaosblade-io/chaosblade/releases
chaosblade-box: https://github.com/chaosblade-io/chaosblade-box/releases
metrics-server: https://github.com/kubernetes-sigs/metrics-server/releases
What is ChaosBlade?
ChaosBlade is a cloud-native chaos engineering platform that supports multiple environments, clusters, and languages.
It consists of the chaos experiment tool chaosblade and the chaos engineering platform chaosblade-box, and aims to help enterprises solve high-availability problems in cloud-native systems through chaos engineering.
1. Install and Deploy Kubernetes
1.1 Set the server hostnames
[root@VM-8-11-centos ~]# hostnamectl set-hostname master
[root@VM-16-10-centos ~]# hostnamectl set-hostname no1
vim /etc/hosts
# BEGIN
10.0.8.11 master
10.0.16.10 no1
# END
reboot
1.2 Disable the firewall, enable time sync, and other prerequisites
# Disable the firewall (run on all nodes)
systemctl stop firewalld && systemctl disable firewalld
# Time synchronization (run on all nodes)
yum install ntpdate -y && ntpdate time.windows.com
# Disable swap (run on all nodes)
swapoff -a && sed -ri 's/.*swap.*/#&/' /etc/fstab
# Disable SELinux (run on all nodes)
# sed -i 's/enforcing/disabled/' /etc/selinux/config && setenforce 0
setenforce 0
# Install required system utilities
sudo yum install -y yum-utils device-mapper-persistent-data lvm2
1.3 Install Docker 20.10.24
Run on every host (master, node, and rancher if used).
# 1. Switch to the Aliyun mirror repo
[root@master ~]# wget https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo -O /etc/yum.repos.d/docker-ce.repo
# 2. List the Docker versions available from the repo
[root@master ~]# yum list docker-ce --showduplicates
# 3. Install a specific docker-ce version
# --setopt=obsoletes=0 is required, otherwise yum automatically installs a newer version
[root@master ~]# yum install --setopt=obsoletes=0 docker-ce-18.06.3.ce-3.el7 -y
# Or (recommended):
yum install -y --setopt=obsoletes=0 docker-ce-20.10.24-3.el8 docker-ce-cli-20.10.24-3.el8
# 4. Configure the systemd unit file
vim /etc/systemd/system/docker.service
# BEGIN
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
[Service]
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/bin/dockerd
ExecReload=/bin/kill -s HUP $MAINPID
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target
# END
chmod +x /etc/systemd/system/docker.service
# 5. Start Docker
# Reload the systemd configuration
systemctl daemon-reload
# Start Docker
systemctl start docker
# Enable Docker at boot
systemctl enable docker.service
# 6. Check the Docker status and version
[root@master ~]# docker version
# A registry mirror can be configured via the daemon config file /etc/docker/daemon.json
sudo mkdir -p /etc/docker
sudo tee /etc/docker/daemon.json <<-'EOF'
{
"exec-opts": ["native.cgroupdriver=systemd"],
"registry-mirrors": [
"https://mirror.ccs.tencentyun.com"
]
}
EOF
sudo systemctl daemon-reload && sudo systemctl restart docker
# 7. Configure IPVS
# 1. Install ipset and ipvsadm
[root@master ~]# yum install ipset ipvsadmin -y
# If you get "No package ipvsadmin available", use: yum install -y ipvsadm
# 2. Write the kernel modules to load into a script file
[root@master ~]# cat <<EOF > /etc/sysconfig/modules/ipvs.modules
#!/bin/bash
modprobe -- ip_vs
modprobe -- ip_vs_rr
modprobe -- ip_vs_wrr
modprobe -- ip_vs_sh
modprobe -- nf_conntrack
EOF
# 3. Make the script executable
[root@master ~]# chmod +x /etc/sysconfig/modules/ipvs.modules
# 4. Run the script
[root@master ~]# /bin/bash /etc/sysconfig/modules/ipvs.modules
# 5. Check that the modules loaded successfully
[root@master ~]# lsmod | grep -e ip_vs -e nf_conntrack_ipv4
1.4 Deploy the Kubernetes 1.20.11 cluster
Network configuration (bridge/iptables settings)
vim /etc/sysctl.d/k8s.conf
# BEGIN
net.bridge.bridge-nf-call-iptables=1
net.bridge.bridge-nf-call-ip6tables=1
net.ipv4.ip_forward=1
vm.swappiness=0
# END
modprobe br_netfilter
sysctl -p /etc/sysctl.d/k8s.conf
Configure the Kubernetes yum repo
vim /etc/yum.repos.d/kubernetes.repo
# BEGIN
[kubernetes]
name=Kubernetes
baseurl=http://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64
enabled=1
gpgcheck=0
repo_gpgcheck=0
gpgkey=http://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg
http://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
# END
# Clear the yum cache
yum clean all
# Download the repo metadata into the local cache (makecache builds the cache)
yum makecache
Install kubelet, kubeadm, and kubectl
# Without a specific version, the latest version in the current yum repo is installed
yum install -y kubelet kubeadm kubectl
# You can also pin versions; the versions below are recommended for this guide
yum install -y kubelet-1.20.11 kubeadm-1.20.11 kubectl-1.20.11
After installation, check the Kubernetes component versions
# kubelet version
[root@master ~]# kubelet --version
Kubernetes v1.20.11
# kubeadm version
[root@master ~]# kubeadm version
kubeadm version: &version.Info{Major:"1", Minor:"20", GitVersion:"v1.20.11", GitCommit:"27522a29febbcc4badac257763044d0d90c11abd", GitTreeState:"clean", BuildDate:"2021-09-15T19:20:34Z", GoVersion:"go1.15.15", Compiler:"gc", Platform:"linux/amd64"}
# Configure the kubelet cgroup driver
# Edit /etc/sysconfig/kubelet and add the following
KUBELET_CGROUP_ARGS="--cgroup-driver=systemd"
KUBE_PROXY_MODE="ipvs"
# Enable kubelet at boot
[root@master ~]# systemctl daemon-reload
[root@master ~]# systemctl start kubelet
[root@master ~]# systemctl status kubelet
# If kubelet fails to start here, ignore the error for now; the later `kubeadm init` will bring it up
[root@master ~]# systemctl enable kubelet
[root@master ~]# systemctl is-enabled kubelet
# Download the cni-plugins-linux-amd64-v1.2.0.tgz plugins
$ wget https://github.com/containernetworking/plugins/releases/download/v1.2.0/cni-plugins-linux-amd64-v1.2.0.tgz
# Extract cni-plugins-linux-amd64-v1.2.0.tgz into /opt/cni/bin/
tar -zxvf cni-plugins-linux-amd64-v1.2.0.tgz -C /opt/cni/bin/
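If /opt/cni/bin does not exist yet, the tar command above fails. A minimal preparation sketch; this is the standard CNI plugin directory, normally created by the kubernetes-cni package:
# create the CNI plugin directory if it is missing
mkdir -p /opt/cni/bin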
Prepare the cluster images
# Before installing the cluster, the required images must be available; list them with the command below
[root@master ~]# kubeadm config images list
# Pull the images
# These images live in the k8s.gcr.io registry, which is often unreachable; the loop below is a workaround
images=(
kube-apiserver:v1.20.11
kube-controller-manager:v1.20.11
kube-scheduler:v1.20.11
kube-proxy:v1.20.11
pause:3.2
etcd:3.4.13-0
coredns:1.7.0
)
for imageName in ${images[@]} ; do
docker pull registry.cn-hangzhou.aliyuncs.com/google_containers/$imageName
docker tag registry.cn-hangzhou.aliyuncs.com/google_containers/$imageName k8s.gcr.io/$imageName
docker rmi registry.cn-hangzhou.aliyuncs.com/google_containers/$imageName
done
Initialize the Kubernetes control plane
Note: run this step only on the server planned as the Kubernetes master (the host named master in this guide).
# Run the init command to initialize the cluster
[root@master ~]# kubeadm init \
--kubernetes-version=v1.20.11 \
--pod-network-cidr=10.244.0.0/16 \
--service-cidr=10.96.0.0/12 \
--apiserver-advertise-address=10.0.8.11 \
--ignore-preflight-errors=all
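kubeadm init prints the follow-up steps when it finishes. A minimal sketch of the usual sequence; the join token and hash below are placeholders, so copy the real kubeadm join command from your own init output:
# on master: let kubectl use the admin kubeconfig
mkdir -p $HOME/.kube
cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
chown $(id -u):$(id -g) $HOME/.kube/config
# on the worker node no1: join the cluster with the command printed by kubeadm init
kubeadm join 10.0.8.11:6443 --token <token> --discovery-token-ca-cert-hash sha256:<hash>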
kube-flannel.yml
---
kind: Namespace
apiVersion: v1
metadata:
name: kube-flannel
labels:
k8s-app: flannel
pod-security.kubernetes.io/enforce: privileged
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
labels:
k8s-app: flannel
name: flannel
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- patch
- apiGroups:
- networking.k8s.io
resources:
- clustercidrs
verbs:
- list
- watch
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
labels:
k8s-app: flannel
name: flannel
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: flannel
subjects:
- kind: ServiceAccount
name: flannel
namespace: kube-flannel
---
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-app: flannel
name: flannel
namespace: kube-flannel
---
kind: ConfigMap
apiVersion: v1
metadata:
name: kube-flannel-cfg
namespace: kube-flannel
labels:
tier: node
k8s-app: flannel
app: flannel
data:
cni-conf.json: |
{
"name": "cbr0",
"cniVersion": "0.3.1",
"plugins": [
{
"type": "flannel",
"delegate": {
"hairpinMode": true,
"isDefaultGateway": true
}
},
{
"type": "portmap",
"capabilities": {
"portMappings": true
}
}
]
}
net-conf.json: |
{
"Network": "10.244.0.0/16",
"Backend": {
"Type": "vxlan"
}
}
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kube-flannel-ds
namespace: kube-flannel
labels:
tier: node
app: flannel
k8s-app: flannel
spec:
selector:
matchLabels:
app: flannel
template:
metadata:
labels:
tier: node
app: flannel
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/os
operator: In
values:
- linux
hostNetwork: true
priorityClassName: system-node-critical
tolerations:
- operator: Exists
effect: NoSchedule
serviceAccountName: flannel
initContainers:
- name: install-cni-plugin
image: docker.io/flannel/flannel-cni-plugin:v1.2.0
command:
- cp
args:
- -f
- /flannel
- /opt/cni/bin/flannel
volumeMounts:
- name: cni-plugin
mountPath: /opt/cni/bin
- name: install-cni
image: docker.io/flannel/flannel:v0.24.0
command:
- cp
args:
- -f
- /etc/kube-flannel/cni-conf.json
- /etc/cni/net.d/10-flannel.conflist
volumeMounts:
- name: cni
mountPath: /etc/cni/net.d
- name: flannel-cfg
mountPath: /etc/kube-flannel/
containers:
- name: kube-flannel
image: docker.io/flannel/flannel:v0.24.0
command:
- /opt/bin/flanneld
args:
- --ip-masq
- --kube-subnet-mgr
resources:
requests:
cpu: "100m"
memory: "50Mi"
securityContext:
privileged: false
capabilities:
add: ["NET_ADMIN", "NET_RAW"]
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: EVENT_QUEUE_DEPTH
value: "5000"
volumeMounts:
- name: run
mountPath: /run/flannel
- name: flannel-cfg
mountPath: /etc/kube-flannel/
- name: xtables-lock
mountPath: /run/xtables.lock
volumes:
- name: run
hostPath:
path: /run/flannel
- name: cni-plugin
hostPath:
path: /opt/cni/bin
- name: cni
hostPath:
path: /etc/cni/net.d
- name: flannel-cfg
configMap:
name: kube-flannel-cfg
- name: xtables-lock
hostPath:
path: /run/xtables.lock
type: FileOrCreate
[root@master ~]# kubectl apply -f kube-flannel.yml
namespace/kube-flannel created
clusterrole.rbac.authorization.k8s.io/flannel created
clusterrolebinding.rbac.authorization.k8s.io/flannel created
serviceaccount/flannel created
configmap/kube-flannel-cfg created
daemonset.apps/kube-flannel-ds created
[root@master ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master Ready control-plane,master 7m59s v1.20.11
no1 Ready <none> 6m26s v1.20.11
Fix for a node where kubectl get nodes does not work
# Copy admin.conf; run this command on the master node:
scp /etc/kubernetes/admin.conf root@10.0.16.10:/etc/kubernetes/admin.conf
# Then configure the environment variable on that worker node:
# Point kubectl at the kubeconfig file
export KUBECONFIG=/etc/kubernetes/admin.conf
echo "export KUBECONFIG=/etc/kubernetes/admin.conf" >> ~/.bash_profile
# After this, kubectl works on the worker node
1.5 Remove the master taints
kubectl describe node master | grep -A 1 "Taints"
kubectl taint nodes master node-role.kubernetes.io/master:NoSchedule-
kubectl taint nodes master node.kubernetes.io/unreachable:NoExecute-
kubectl taint nodes master node.kubernetes.io/unreachable:NoSchedule-
systemctl restart kubelet
1.6 Install Metrics Server 0.5.2
wget https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.5.2/components.yaml
vim components.yaml
# BEGIN modifications
- --kubelet-insecure-tls
image: registry.aliyuncs.com/google_containers/metrics-server:v0.5.2
serviceAccountName: metrics-server
hostNetwork: true
# END modifications
kubectl apply -f components.yaml
# Check
kubectl get po -n kube-system
kubectl get apiservices
# 1. Install Metrics Server (an alternative to applying components.yaml above)
[root@master ~]# vim metrics-server.yml
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-app: metrics-server
name: metrics-server
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
k8s-app: metrics-server
rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true"
name: system:aggregated-metrics-reader
rules:
- apiGroups:
- metrics.k8s.io
resources:
- pods
- nodes
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
k8s-app: metrics-server
name: system:metrics-server
rules:
- apiGroups:
- ""
resources:
- pods
- nodes
- nodes/stats
- namespaces
- configmaps
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
k8s-app: metrics-server
name: metrics-server-auth-reader
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
k8s-app: metrics-server
name: metrics-server:system:auth-delegator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
k8s-app: metrics-server
name: system:metrics-server
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:metrics-server
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
labels:
k8s-app: metrics-server
name: metrics-server
namespace: kube-system
spec:
ports:
- name: https
port: 443
protocol: TCP
targetPort: https
selector:
k8s-app: metrics-server
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
k8s-app: metrics-server
name: metrics-server
namespace: kube-system
spec:
selector:
matchLabels:
k8s-app: metrics-server
strategy:
rollingUpdate:
maxUnavailable: 0
template:
metadata:
labels:
k8s-app: metrics-server
spec:
containers:
- args:
- --cert-dir=/tmp
- --secure-port=4443
- --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
- --kubelet-use-node-status-port
- --metric-resolution=15s
- --kubelet-insecure-tls
image: registry.aliyuncs.com/google_containers/metrics-server:v0.5.2
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
httpGet:
path: /livez
port: https
scheme: HTTPS
periodSeconds: 10
name: metrics-server
ports:
- containerPort: 4443
name: https
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /readyz
port: https
scheme: HTTPS
initialDelaySeconds: 20
periodSeconds: 10
resources:
requests:
cpu: 100m
memory: 200Mi
securityContext:
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
volumeMounts:
- mountPath: /tmp
name: tmp-dir
nodeSelector:
kubernetes.io/os: linux
priorityClassName: system-cluster-critical
serviceAccountName: metrics-server
hostNetwork: true
volumes:
- emptyDir: {}
name: tmp-dir
---
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
labels:
k8s-app: metrics-server
name: v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
groupPriorityMinimum: 100
insecureSkipTLSVerify: true
service:
name: metrics-server
namespace: kube-system
version: v1beta1
versionPriority: 100
[root@master ~]# kubectl apply -f metrics-server.yml
# 2. Check that the metrics-server pod is running
[root@master ~]# kubectl get pods -n=kube-system |grep metrics
metrics-server-5fc47c948-552xh 1/1 Running 0 7h2m
# 3. Use kubectl top to view CPU and memory usage
[root@master ~]# kubectl top node
NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
master 168m 8% 2366Mi 66%
no1 64m 3% 788Mi 48%
1.7 Install the Kubernetes dashboard: KubePi
vim kubepi.yml
apiVersion: v1
kind: ServiceAccount
metadata:
name: kubepi-user
namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: kubepi-user
annotations:
rbac.authorization.kubernetes.io/autoupdate: "true"
roleRef:
kind: ClusterRole
name: cluster-admin
apiGroup: rbac.authorization.k8s.io
subjects:
- kind: ServiceAccount
name: kubepi-user
namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kubepi-user
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: v1
kind: Secret
metadata:
name: kubepi-user
namespace: kube-system
annotations:
kubernetes.io/service-account.name: "kubepi-user"
type: kubernetes.io/service-account-token
---
apiVersion: v1
kind: Service
metadata:
name: kubepi
namespace: kube-system
spec:
type: NodePort
ports:
- name: http
port: 80
targetPort: 80
protocol: TCP
selector:
app.kubernetes.io/name: kubepi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: kubepi
namespace: kube-system
labels:
app.kubernetes.io/name: kubepi
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: kubepi
template:
metadata:
labels:
app.kubernetes.io/name: kubepi
spec:
containers:
- name: kubepi
image: kubeoperator/kubepi-server:latest
imagePullPolicy: Always
ports:
- containerPort: 80
protocol: TCP
securityContext:
privileged: true
kubectl apply -f kubepi.yml
# Username: admin
# Password: kubepi
kubectl get po,svc -n kube-system -o wide
# Get the apiserver address
[root@master ~]# cat ~/.kube/config | grep server: | awk '{print $2}'
# Get the token
[root@master ~]# kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep kubepi-user | awk '{print $1}') | grep token: | awk '{print $2}'
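To open the KubePi UI in a browser, you also need the NodePort assigned to the kubepi service; a small sketch, assuming access via any node IP:
# get the NodePort of the kubepi service
kubectl get svc kubepi -n kube-system -o jsonpath='{.spec.ports[0].nodePort}'
# then browse to http://<node-ip>:<nodeport> and log in with the credentials above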
2. Install and Deploy ChaosBlade
2.1 Install Helm 3.6.2
# 1. Extract the package
tar -zxvf helm-v3.6.2-linux-amd64.tar.gz
# 2. Copy the binary into PATH
cp linux-amd64/helm /usr/local/bin
# 3. Verify the installation
helm version
2.2 Install chaosblade-operator 1.7.3
# 1. Create a namespace for chaosblade
[root@master ~]# kubectl create namespace chaosblade
namespace/chaosblade created
# 2. Install ChaosBlade-Operator
[root@master ~]# helm repo add chaosblade-io https://chaosblade-io.github.io/charts
[root@master ~]# helm install chaosblade chaosblade-io/chaosblade-operator --namespace chaosblade --set webhook.enable=true
# --set webhook.enable=true is required for the Pod filesystem I/O fault experiments
# 3. Check the installation
[root@master ~]# helm ls --all-namespaces
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
chaosblade chaosblade 1 2024-08-24 08:10:13.394948721 +0800 CST deployed chaosblade-operator-1.7.3 1.7.3
[root@master ~]# kubectl get pod -n chaosblade
NAME READY STATUS RESTARTS AGE
chaosblade-operator-6cb795dbc4-csj5h 1/1 Running 0 6h57m
chaosblade-tool-9jxht 1/1 Running 0 6h57m
chaosblade-tool-dr9g9 1/1 Running 0 6h57m
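Optionally, confirm that the operator registered its custom resource; a quick check (the CRD name in the comment is what recent chaosblade-operator charts create — verify against your own cluster):
# the operator should have installed the ChaosBlade CRD
kubectl get crd | grep chaosblade
# expect an entry such as chaosblades.chaosblade.io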
2.3 Install chaosblade-box 1.0.3
[root@master ~]# helm install chaosblade-box chaosblade-box-1.0.3.tgz --namespace chaosblade --set spring.datasource.password=Wfh1998#2024
NAME: chaosblade-box
LAST DEPLOYED: Sat Aug 24 08:11:24 2024
NAMESPACE: chaosblade
STATUS: deployed
REVISION: 1
TEST SUITE: None
NOTES:
Thank you for using chaosblade-box.
[root@master ~]# kubectl get po,svc -n chaosblade -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
pod/chaosblade-box-547d4447ff-f6dlj 1/1 Running 0 6h52m 10.244.0.9 master <none> <none>
pod/chaosblade-box-mysql-59ccc86cc6-ghdc9 1/1 Running 0 6h57m 10.244.0.7 master <none> <none>
pod/chaosblade-operator-6cb795dbc4-csj5h 1/1 Running 0 6h58m 10.244.0.6 master <none> <none>
pod/chaosblade-tool-9jxht 1/1 Running 0 6h58m 10.0.16.10 no1 <none> <none>
pod/chaosblade-tool-dr9g9 1/1 Running 0 6h58m 10.0.8.11 master <none> <none>
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR
service/chaosblade-box LoadBalancer 10.106.177.130 <pending> 7001:31156/TCP 6h57m app=chaosblade-box
service/chaosblade-box-mysql ClusterIP 10.103.123.204 <none> 3306/TCP 6h57m app=chaosblade-box-mysql
service/chaosblade-webhook-server ClusterIP 10.101.132.197 <none> 443/TCP 6h58m name=chaosblade-operator
2.4 Register and log in to ChaosBlade
Registration/login credentials: admin/admin
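The console address comes from the chaosblade-box service; a small sketch for building the login URL, matching the 7001:31156/TCP NodePort mapping shown in the kubectl get svc output above:
# find the NodePort mapped to the box's port 7001
kubectl get svc chaosblade-box -n chaosblade -o jsonpath='{.spec.ports[0].nodePort}'
# open http://<node-ip>:<nodeport> in a browser and register/log in with admin/admin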
2.5 Install the agent probe
# How to get the cluster ID and cluster name
[root@master ~]# kubectl config current-context
kubernetes-admin@kubernetes
[root@master ~]# kubectl cluster-info
Kubernetes control plane is running at https://10.0.8.11:6443
KubeDNS is running at https://10.0.8.11:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy
To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'.
# Download chaos.tgz
wget https://chaosblade.oss-cn-hangzhou.aliyuncs.com/platform/release/1.0.2/chaosblade-box-agent-1.0.2.tgz -O chaos.tgz
# Install
# helm install agent chaos.tgz --namespace chaosblade --set env.name=default,license=2fbb7b1084f44d30b83a062921ac3e69,images.chaos.repository=chaosbladeio/chaosblade-agent,images.chaos.version=1.0.2,transport.endpoint={replace with the box ip:port},controller.cluster_id={replace with the cluster id, any value is fine},controller.cluster_name={replace with the cluster name, any value is fine}
# Example
[root@master ~]# helm install agent chaos.tgz --namespace chaosblade --set env.name=default,license=2fbb7b1084f44d30b83a062921ac3e69,images.chaos.repository=chaosbladeio/chaosblade-agent,images.chaos.version=1.0.2,transport.endpoint=10.106.177.130:7001,controller.cluster_id=kubernetes,controller.cluster_name=kubernetes-admin@kubernetes
NAME: agent
LAST DEPLOYED: Sat Aug 24 16:07:16 2024
NAMESPACE: chaosblade
STATUS: deployed
REVISION: 1
TEST SUITE: None
NOTES:
Thank you for using the agent service, you can check the service installation status by the following command:
kubectl get pods -n chaosblade.
The agent installation is complete, as shown in the figure.
3. Simulation Experiments
3.1 Install the test demo
[root@master ~]# unzip guestbook.zip
Archive: guestbook.zip
creating: guestbook/
inflating: guestbook/guestbook-deployment.yaml
inflating: guestbook/guestbook-service.yaml
inflating: guestbook/loss_pod_network_by_names.yaml
inflating: guestbook/redis-master-deployment.yaml
inflating: guestbook/redis-master-service.yaml
inflating: guestbook/redis-slave-deployment.yaml
inflating: guestbook/redis-slave-service.yaml
[root@master ~]# kubectl apply -f guestbook/guestbook-deployment.yaml -n chaosblade
[root@master ~]# kubectl apply -f guestbook/guestbook-service.yaml -n chaosblade
[root@master ~]# kubectl apply -f guestbook/loss_pod_network_by_names.yaml -n chaosblade
[root@master ~]# kubectl apply -f guestbook/redis-master-deployment.yaml -n chaosblade
[root@master ~]# kubectl apply -f guestbook/redis-master-service.yaml -n chaosblade
[root@master ~]# kubectl apply -f guestbook/redis-slave-deployment.yaml -n chaosblade
[root@master ~]# kubectl apply -f guestbook/redis-slave-service.yaml -n chaosblade
[root@master ~]# kubectl get po -n chaosblade
NAME READY STATUS RESTARTS AGE
chaos-agent-77c665cd54-jhdmd 1/1 Running 1 22m
chaosblade-box-547d4447ff-5rgf6 1/1 Running 1 30m
chaosblade-box-mysql-59ccc86cc6-ghdc9 1/1 Running 1 7h31m
chaosblade-operator-6cb795dbc4-csj5h 1/1 Running 1 7h32m
chaosblade-tool-9jxht 1/1 Running 1 7h32m
chaosblade-tool-dr9g9 1/1 Running 1 7h32m
guestbook-v1-7676687667-2d5lw 1/1 Running 0 30s
guestbook-v1-7676687667-ws4bq 1/1 Running 0 30s
guestbook-v1-7676687667-xbtgl 1/1 Running 0 30s
redis-master-7bf5f6d487-5s26z 1/1 Running 0 19s
redis-slave-58db457857-4c726 1/1 Running 0 11s
redis-slave-58db457857-kq76j 1/1 Running 0 11s
3.2 In-container CPU full load
- Select the drill scenario: Kubernetes → System Resources → CPU → In-container CPU full load, then click Create Drill
- On the page, set the drill name, group name, application, application group, machine list, and so on
- The drill content is added automatically; click it and configure the parameters
# How to obtain the required parameters
[root@master ~]# kubectl describe pod guestbook-v1-7676687667-ws4bq -n chaosblade
Name:           guestbook-v1-7676687667-ws4bq
Namespace:      chaosblade
Priority:       0
Node:           master/10.0.8.11
Start Time:     Sat, 24 Aug 2024 15:42:11 +0800
Labels:         app=guestbook
                pod-template-hash=7676687667
                version=1.0
Annotations:    <none>
Status:         Running
IP:             10.244.0.21
IPs:
  IP:           10.244.0.21
Controlled By:  ReplicaSet/guestbook-v1-7676687667
Containers:
  guestbook:
    Container ID:   docker://d1c8a9ee9a6c413dda44bdf1c560edc69fe3afb582b3fed8f276c70b17cb5506   // use the ID after docker://
......
# Get the container index
# A Pod is a group of one or more containers.
# The Pod guestbook-v1-7676687667-ws4bq has a single container named guestbook, so its index is 0.
[root@master ~]# kubectl get pod guestbook-v1-7676687667-ws4bq -n chaosblade -o jsonpath='{.spec.containers[*].name}'
guestbook
As shown in the figure below.
- On the global configuration page, complete the following settings
- Click Next, then Create Drill
- Click Drill Details to go to the drill page, then click Execute Drill
- After the drill is running, check the CPU status inside the container
[root@master ~]# kubectl exec -it guestbook-v1-7676687667-ws4bq -n chaosblade -c guestbook -- top
# The container's CPU load reaches the configured 80%
- After the drill ends (stopped manually or automatically), check the container CPU again
The CPU load has returned to normal.
3.3 In-container memory load
Configuration: namespace, container-ids, container-index, container-names, labels, and names are unchanged from the previous drill; just set mem-percent=60 and mode=ram.
(From here on, unless otherwise noted, only added or changed parameters are listed.)
- Select the drill scenario: Kubernetes → System Resources → Memory → In-container memory load, then click Create Drill
- After the drill is created, run it; once it succeeds, check memory usage inside the container — it is close to 60% (see the sketch below)
- After the drill ends, check again — memory usage has returned to normal
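A minimal verification sketch, reusing top inside the container as in the CPU drill (assuming the guestbook image provides it):
# check memory usage inside the container during the drill
kubectl exec -it guestbook-v1-7676687667-ws4bq -n chaosblade -c guestbook -- top
# the memory usage shown should be close to the configured 60%, and drop back after the drill ends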
3.4 Network resources
3.4.1 In-container network packet loss
- Select the drill scenario: Kubernetes → System Resources → Network → In-container network packet loss, then click Create Drill
- Configuration: interface=eth0, percent=50, destination-ip=10.244.0.22
- After the drill is created, run it; once it succeeds, check the network inside the container — pinging the destination IP shows roughly 50% packet loss (see the sketch below)
[root@master ~]# kubectl exec -it guestbook-v1-7676687667-ws4bq -n chaosblade -c guestbook -- sh
- After the drill ends, check again — the network is back to normal
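A verification sketch run in the container shell opened above (assuming ping is available in the guestbook image; 10.244.0.22 is the destination-ip configured for this drill):
# inside the container: ping the drill's destination IP
ping -c 20 10.244.0.22
# the summary line should report roughly 50% packet loss while the drill runs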
3.4.2 In-container port occupation
- Select the drill scenario: Kubernetes → System Resources → Network → In-container port occupation, then click Create Drill
- Configuration: port=8888
- After the drill is created, run it; once it succeeds, check the network inside the container — port 8888 is occupied
[root@master ~]# kubectl exec -it guestbook-v1-7676687667-ws4bq -n chaosblade -c guestbook -- sh
# inside the container:
netstat -tunl | grep 8888
- After the drill ends, check again — the port has been released
3.4.3 In-container DNS failure
- Select the drill scenario: Kubernetes → System Resources → Network → In-container DNS failure, then click Create Drill
- Configuration: domain=www.baidu.com, ip=10.96.0.10 (obtained via nslookup www.baidu.com)
- After the drill is created, run it; once it succeeds, check the network inside the container — DNS resolution is broken (see the sketch below)
- After the drill ends, check again — DNS works normally
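A verification sketch inside the container; my assumption is that the DNS drill rewrites how the domain resolves (ChaosBlade typically does this through /etc/hosts), so pinging the domain or inspecting /etc/hosts shows the injected mapping:
# inside the container: observe how the domain resolves during the drill
ping -c 3 www.baidu.com
cat /etc/hosts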
3.4.4 In-container packet reordering
- Select the drill scenario: Kubernetes → System Resources → Network → In-container packet reordering, then click Create Drill
- Configuration: correlation=10, interface=eth0, percent=30, destination-ip=10.244.0.22, time=100
- After the drill is created, run it; once it succeeds, check the network inside the container — packets arrive out of order and round-trip time reaches the configured 100 ms
- After the drill ends, check again — the network is back to normal
3.4.5 In-container packet corruption
- Select the drill scenario: Kubernetes → System Resources → Network → In-container packet corruption, then click Create Drill
- Configuration: interface=eth0, percent=100, destination-ip=10.244.0.22
- After the drill is created, run it; once it succeeds, check the network inside the container — traffic to the destination IP is corrupted
- After the drill ends, check again — the network is back to normal
3.4.6 In-container network delay
- Select the drill scenario: Kubernetes → System Resources → Network → In-container network delay, then click Create Drill
- Configuration: interface=eth0, time=100, destination-ip=10.244.0.22
- After the drill is created, run it; once it succeeds, check the network inside the container — latency reaches the configured 100 ms
- After the drill ends, check again — the network is back to normal
3.5 Disk resources
3.5.1 In-container file move
- Select the drill scenario: Kubernetes → System Resources → Disk → In-container file move, then click Create Drill
- Configuration: filepath=/app/public/index.html, target=/app/
- After the drill is created, run it; once it succeeds, check the result inside the container (see the sketch below)
- After the drill ends, check again — the file is back in its original location
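A verification sketch inside the container, using the paths configured above (run it during the drill and again after it ends):
# index.html should appear under /app/ during the drill and back under /app/public/ afterwards
kubectl exec -it guestbook-v1-7676687667-ws4bq -n chaosblade -c guestbook -- ls /app/ /app/public/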
3.5.2 In-container file creation
- Select the drill scenario: Kubernetes → System Resources → Disk → In-container file creation, then click Create Drill
- Configuration: filepath=/app/test.txt, content=appendContent
- After the drill is created, run it; once it succeeds, the new file is visible inside the container
- After the drill ends, check again — the file has been deleted
3.5.3 In-container file deletion
- Select the drill scenario: Kubernetes → System Resources → Disk → In-container file deletion, then click Create Drill
- Configuration: filepath=/app/public/index.html
- After the drill is created, run it; once it succeeds, the specified file has been deleted inside the container
- After the drill ends, check again — the file has been restored
3.5.4 In-container file append
- Select the drill scenario: Kubernetes → System Resources → Disk → In-container file append, then click Create Drill
- Configuration: filepath=/app/test.txt, content=xxxXXX
- After the drill is created, run it; once it succeeds, check the result inside the container
Note: for file append, the appended content is not rolled back when the drill ends.
3.5.5 In-container file attribute change
- Select the drill scenario: Kubernetes → System Resources → Disk → Change file attributes, then click Create Drill
- Configuration: filepath=/app/public/index.html, mark=777
- After the drill is created, run it; once it succeeds, check the file attributes inside the container — the permissions have been changed to 777
- After the drill ends, check again — the permissions have been restored
3.5.6 In-container disk I/O load
- Select the drill scenario: Kubernetes → System Resources → Disk → In-container disk load, then click Create Drill
- Configuration: path=/, size=50, write=50
- After the drill is created, run it; once it succeeds, the disk I/O load inside the container keeps climbing
- After the drill ends, check again — the disk load is back to normal
3.5.7 In-container disk fill
- Select the drill scenario: Kubernetes → System Resources → Disk → In-container disk fill, then click Create Drill
- Configuration: path=/, percent=80
- The dd version inside the guestbook container makes this drill fail, so the redis-slave container is used instead
kubectl exec -it redis-slave-58db457857-kq76j -n chaosblade -c redis-slave -- sh
- After the drill is created, run it; once it succeeds, the disk usage inside the container reaches 80% (see the sketch below)
- After the drill ends, check again — the disk usage is back to normal
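A verification sketch inside the redis-slave container (path=/ and percent=80 are the drill parameters above):
# during the drill the root filesystem usage climbs to about 80%, and frees up again afterwards
kubectl exec -it redis-slave-58db457857-kq76j -n chaosblade -c redis-slave -- df -h /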
3.6 Application processes
3.6.1 In-container process stop
- Select the drill scenario: Kubernetes → System Resources → Application Process → In-container process stop, then click Create Drill
- Configuration: process=top
- After the drill is created, run it; once it succeeds, the top process inside the container is paused (see the sketch below)
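A verification sketch; my assumption is that the process-stop drill pauses the target with a stop signal, so a procps-style ps (if present in the image) shows the top process in the stopped 'T' state while the drill runs:
# check the state of the top process inside the container
kubectl exec -it guestbook-v1-7676687667-ws4bq -n chaosblade -c guestbook -- ps aux
# during the drill the top process shows STAT 'T'; after the drill it runs normally again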
3.6.2 In-container process kill
- Select the drill scenario: Kubernetes → System Resources → Application Process → In-container process kill, then click Create Drill
- Configuration: process=top, signal=9
- After the drill is created, run it; once it succeeds, the top process inside the container has been killed
4. Operating Kubernetes from Java
4.1 Kubernetes utility class
public class K8sUtil {
public static ApiClient getApiClient() throws IOException {
InputStream kubeConfigIs = K8sUtil.class.getClassLoader()
.getResourceAsStream("kube-config.yaml");
assert kubeConfigIs != null;
ByteArrayInputStream kubeConfig = new ByteArrayInputStream(IOUtils.toByteArray(kubeConfigIs));
return Config.fromConfig(kubeConfig);
}
public static V1PodList getPods(ApiClient apiClient, String namespace) throws ApiException {
// If you hit connection errors, see https://www.cnblogs.com/leozhanggg/p/16243205.html
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
return coreV1Api.listNamespacedPod(namespace, null, null, null, null, null, null, null, null, 300, null);
}
public static V1PodList getPodsByLabel(ApiClient apiClient, String namespace, String label) throws ApiException {
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
return coreV1Api.listNamespacedPod(namespace, null, null, null, null, label, null, null, null, 300, null);
}
public static V1NodeList getNodes(ApiClient apiClient) throws ApiException {
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
return coreV1Api.listNode(null, null, null, null, null, null, null, null, 300, null);
}
public static V1NodeList getNodesByLabel(ApiClient apiClient, String label) throws ApiException {
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
return coreV1Api.listNode(null, null, null, label, null, null, null, null, 300, null);
}
public static V1NamespaceList getNamespaces(ApiClient apiClient) throws ApiException {
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
return coreV1Api.listNamespace(null, null, null, null, null, null, null, null, 300, null);
}
public static List<Map<String, String>> getOperationToolsLogs(ApiClient apiClient, Integer lines, String namespace, String podName) throws ApiException {
List<Map<String, String>> resList = new CopyOnWriteArrayList<>();
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
V1PodList v1PodList = coreV1Api.listNamespacedPod(namespace, null, null, null, null, null, null, null, null, 50, null);
if (Objects.isNull(v1PodList)) return resList;
List<V1Pod> list = v1PodList.getItems().stream().filter(item -> "Running".equalsIgnoreCase(Objects.requireNonNull(item.getStatus()).getPhase())).filter(item -> item.getMetadata().getName().contains(podName)).collect(Collectors.toList());
if (CollectionUtils.isEmpty(list)) return resList;
for (V1Pod v1Pod : list) {
String image = Objects.requireNonNull(v1Pod.getStatus()).getContainerStatuses().get(0).getImage();
String version = image.substring(StringUtils.lastIndexOf(image, ":") + 1);
Exec exec = new Exec(apiClient);
String[] commands = new String[3];
commands[0] = "sh";
commands[1] = "-c";
commands[2] = "tail -n " + lines + " /opt/chaosblade/logs/chaosblade.log";
Process process = null;
String result = StringUtils.EMPTY;
try {
process = exec.exec(Objects.requireNonNull(v1Pod.getMetadata()).getNamespace(), v1Pod.getMetadata().getName(), commands, true);
} catch (IOException e) {
throw new RuntimeException(e);
}
result = new String(IoUtil.readBytes(process.getInputStream()));
if (process != null) process.destroy();
Map<String, String> map = new ConcurrentHashMap<>();
map.put(version, result);
resList.add(map);
}
return resList;
}
public static List<String> listDeploymentsByNamespace(ApiClient apiClient, String namespace) throws ApiException {
AppsV1Api appsV1Api = new AppsV1Api(apiClient);
V1DeploymentList v1DeploymentList = appsV1Api.listNamespacedDeployment(namespace, String.valueOf(false), null, null, null, null, null, null, null, 30, null);
if (Objects.isNull(v1DeploymentList) || CollectionUtils.isEmpty(v1DeploymentList.getItems())) return new CopyOnWriteArrayList<>();
List<String> list = v1DeploymentList.getItems().stream().map(item -> item.getMetadata().getName()).collect(Collectors.toList());
return list;
}
public static List<String> getContainersByLabel(ApiClient apiClient, String namespace, String label) throws ApiException {
List<String> containers = new CopyOnWriteArrayList<>();
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
V1PodList v1PodList = coreV1Api.listNamespacedPod(namespace, null, null, null, null, label, null, null, null, 300, null);
if (Objects.isNull(v1PodList) || CollectionUtils.isEmpty(v1PodList.getItems())) return new CopyOnWriteArrayList<>();
v1PodList.getItems().stream().filter(item -> "Running".equalsIgnoreCase(Objects.requireNonNull(item.getStatus()).getPhase())).forEach(item -> {
List<V1ContainerStatus> containerStatuses = item.getStatus().getContainerStatuses();
assert containerStatuses != null;
for (V1ContainerStatus containerStatus : containerStatuses) {
containers.add(Objects.requireNonNull(item.getMetadata()).getName() + "*" + containerStatus.getName());
}
});
return containers;
}
public static List<String> getPodsByDeployment(ApiClient apiClient, String namespace, String deployment) throws ApiException {
List<String> resList = new CopyOnWriteArrayList<>();
AppsV1Api appsV1Api = new AppsV1Api(apiClient);
V1Deployment v1Deployment = appsV1Api.readNamespacedDeployment(deployment, namespace, "false");
StringJoiner labelSelectors = new StringJoiner(",");
Map<String, String> matchLabels = Objects.requireNonNull(v1Deployment.getSpec()).getSelector().getMatchLabels();
assert matchLabels != null;
matchLabels.forEach((key, value) -> {
labelSelectors.add(key + "=" + value);
});
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
V1PodList v1PodList = coreV1Api.listNamespacedPod(namespace, "false", null, null, null, labelSelectors.toString(), null, null, null, 300, null);
if (Objects.isNull(v1PodList) || CollectionUtils.isEmpty(v1PodList.getItems())) return new CopyOnWriteArrayList<>();
v1PodList.getItems().stream().filter(item -> "Running".equalsIgnoreCase(Objects.requireNonNull(item.getStatus()).getPhase())).forEach(item -> {
String podName = Objects.requireNonNull(item.getMetadata()).getName();
assert podName != null;
String[] split = podName.split("-");
StringJoiner sj = new StringJoiner("-");
for (int i = 0; i < split.length - 2; i++) {
sj.add(split[i]);
}
if (sj.toString().equals(deployment)) {
resList.add(item.getMetadata().getName());
}
});
return resList;
}
public static List<String> getContainersByDeployment(ApiClient apiClient, String namespace, String deployment) throws ApiException {
List<String> resList = new CopyOnWriteArrayList<>();
AppsV1Api appsV1Api = new AppsV1Api(apiClient);
V1Deployment v1Deployment = appsV1Api.readNamespacedDeployment(deployment, namespace, "false");
StringJoiner labelSelectors = new StringJoiner(",");
Map<String, String> matchLabels = Objects.requireNonNull(v1Deployment.getSpec()).getSelector().getMatchLabels();
assert matchLabels != null;
matchLabels.forEach((key, value) -> {
labelSelectors.add(key + "=" + value);
});
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
V1PodList v1PodList = coreV1Api.listNamespacedPod(namespace, "false", null, null, null, labelSelectors.toString(), null, null, null, 300, null);
if (Objects.isNull(v1PodList) || CollectionUtils.isEmpty(v1PodList.getItems())) return new CopyOnWriteArrayList<>();
v1PodList.getItems().stream().filter(item -> "Running".equalsIgnoreCase(Objects.requireNonNull(item.getStatus()).getPhase())).forEach(item -> {
String podName = Objects.requireNonNull(item.getMetadata()).getName();
assert podName != null;
String[] split = podName.split("-");
StringJoiner sj = new StringJoiner("-");
for (int i = 0; i < split.length - 2; i++) {
sj.add(split[i]);
}
if (sj.toString().equals(deployment)) {
List<V1ContainerStatus> containerStatuses = item.getStatus().getContainerStatuses();
assert containerStatuses != null;
for (V1ContainerStatus containerStatus : containerStatuses) {
resList.add(item.getMetadata().getName() + "*" + containerStatus.getName());
}
}
});
return resList;
}
public static List<String> getLabelsByNodes(ApiClient apiClient) throws ApiException {
Set<String> set = new ConcurrentHashSet<>();
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
V1NodeList v1NodeList = coreV1Api.listNode(null, null, null, null, null, null, null, null, 50, null);
if (Objects.isNull(v1NodeList) || CollectionUtils.isEmpty(v1NodeList.getItems())) return new CopyOnWriteArrayList<>();
for (V1Node v1Node : v1NodeList.getItems()) {
Map<String, String> labels = Objects.requireNonNull(v1Node.getMetadata()).getLabels();
assert labels != null;
labels.forEach((key, value) -> {
set.add(key + "=" + value);
});
}
return new CopyOnWriteArrayList<>(set);
}
public static List<String> getLabelsByNamespaces(ApiClient apiClient, String namespace) throws ApiException {
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
V1PodList v1PodList = coreV1Api.listNamespacedPod(namespace, null, null, null, null, null, null, null, null, 300, null);
if (v1PodList != null && !CollectionUtils.isEmpty(v1PodList.getItems())) {
Set<String> set = new HashSet<>();
for (V1Pod item : v1PodList.getItems()) {
if ("Running".equals(Objects.requireNonNull(item.getStatus()).getPhase())) {
Map<String, String> labels = Objects.requireNonNull(item.getMetadata()).getLabels();
assert labels != null;
labels.forEach((key, value) -> {
set.add(key + "=" + value);
});
}
}
if (!CollectionUtils.isEmpty(set)) {
return new CopyOnWriteArrayList<>(set);
}
}
return new CopyOnWriteArrayList<>();
}
public static String getHelpCommand(ApiClient apiClient, String namespace, String podName, String command) throws ApiException {
CoreV1Api coreV1Api = new CoreV1Api(apiClient);
V1PodList v1PodList = coreV1Api.listNamespacedPod(namespace, null, null, null, null, null, null, null, null, 50, null);
if (Objects.isNull(v1PodList)) return StringUtils.EMPTY;
List<V1Pod> list = v1PodList.getItems().stream().filter(item -> "Running".equalsIgnoreCase(Objects.requireNonNull(item.getStatus()).getPhase())).filter(item -> item.getMetadata().getName().contains(podName)).collect(Collectors.toList());
if (CollectionUtils.isEmpty(list)) return StringUtils.EMPTY;
Exec exec = new Exec(apiClient);
String[] commands = new String[3];
commands[0] = "sh";
commands[1] = "-c";
commands[2] = command;
Process process = null;
String helpStr = StringUtils.EMPTY;
try {
process = exec.exec(Objects.requireNonNull(list.get(0).getMetadata()).getNamespace(), list.get(0).getMetadata().getName(), commands, true);
} catch (IOException e) {
throw new RuntimeException(e);
}
helpStr = new String(IoUtil.readBytes(process.getInputStream()));
process.destroy();
helpStr = helpStr.substring(helpStr.indexOf("Examples:"), helpStr.indexOf("Flags:"));
return helpStr;
}
}
Running the code fails with an error, as shown below.
Solution
When a Kubernetes cluster is built on a public cloud, it is usually deployed with the IP address bound to the node's NIC, typically a private 10.0.x.x address.
We may want to use the cluster's kubeconfig file locally to access the cluster, but the server IP recorded in kubeconfig is that private 10.0.x.x address, which is not reachable from outside. If you change the server address in kubeconfig to the master's public IP, the call fails with:
io.kubernetes.client.openapi.ApiClient@76f2bbc1
Exception in thread "main" io.kubernetes.client.openapi.ApiException: javax.net.ssl.SSLPeerUnverifiedException: Hostname xx.xx.xx.xx not verified:
The cause is clear from the error: the certificate does not cover the IP being accessed. The fix is to regenerate the apiserver certificate (the cluster itself does not need to change).
The steps below assume a cluster deployed with kubeadm; other deployment methods work similarly, though paths may differ.
- Back up the current cluster configuration
cp -r /etc/kubernetes /etc/kubernetes.bak
- Delete the current apiserver cert and key
rm -rf /etc/kubernetes/pki/apiserver.*
- Generate a new apiserver cert and key
kubeadm init phase certs apiserver --apiserver-advertise-address ${Internal_IP} --apiserver-cert-extra-sans ${External_IP}
- Refresh admin.conf
kubeadm alpha certs renew admin.conf
- Restart the apiserver
kubectl -n kube-system delete pod -l component=kube-apiserver
- Refresh ~/.kube/config
\cp /etc/kubernetes/admin.conf ~/.kube/config
- Note: the regenerated admin.conf still uses the master's internal IP in the server field; if it will be used from a client outside the cluster, change that IP to the master's external IP:
sed -i "s/${Internal_IP}/${External_IP}/g" ~/.kube/config
Run the code again; the result is normal.