Deploying bitnami/kube-prometheus monitoring on Kubernetes with Helm and setting up email alerts

Deploying bitnami/kube-prometheus on Kubernetes

Most of the images used by kube-prometheus-stack are hosted overseas, so pulls tend to fail on Linux hosts without a VPN. This guide therefore combines StorageClass storage, Helm, bitnami/kube-prometheus, and custom alert rules to deploy Prometheus + Alertmanager with email alerting on Kubernetes. The operating system is Ubuntu 20.04.
Main alert coverage:

  • Kubernetes cluster alerts
  • PostgreSQL alerts
  • Elasticsearch alerts
  • kube-state-metrics alerts

1. Create the StorageClass

Skip this section if you already have (or do not need) a StorageClass.

1.1 Install NFS

Install the NFS server, create a local directory, and add it to the NFS exports.

# install NFS
apt install nfs-kernel-server nfs-common

# create the shared directory
mkdir /nfs
mkdir /nfs/k8sdata

# configure the export -- /etc/exports should contain the following line
cat /etc/exports
/nfs *(rw,sync,insecure,no_subtree_check,no_root_squash)

# restart the services
systemctl restart rpcbind
systemctl daemon-reload
systemctl enable nfs-kernel-server
systemctl restart nfs-server
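Before moving on, it is worth verifying that the export is actually active. The commands below are a quick sanity check (the /mnt mount point is arbitrary):

# reload and list the NFS exports
exportfs -ra
showmount -e localhost

# optional: mount the share locally and verify read/write access
mount -t nfs 127.0.0.1:/nfs/k8sdata /mnt
touch /mnt/testfile && rm /mnt/testfile
umount /mnt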
1.2 Create the YAML manifests

rbac.yaml

vim rbac.yaml
 
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfs-client-provisioner
  namespace: default
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nfs-client-provisioner-runner
rules:
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "update", "patch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: run-nfs-client-provisioner
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: default
roleRef:
  kind: ClusterRole
  name: nfs-client-provisioner-runner
  apiGroup: rbac.authorization.k8s.io
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: default
rules:
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: default
roleRef:
  kind: Role
  name: leader-locking-nfs-client-provisioner
  apiGroup: rbac.authorization.k8s.io

nfs-StorageClass.yaml

vim nfs-StorageClass.yaml
 
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: managed-nfs-storage
provisioner: managed-nfs-storage   # note this name; it must match PROVISIONER_NAME below
parameters:
  archiveOnDelete: "false"

nfs-provisioner.yaml

vim nfs-provisioner.yaml
 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nfs-client-provisioner
  labels:
    app: nfs-client-provisioner
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nfs-client-provisioner
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: nfs-client-provisioner
    spec:
      serviceAccountName: nfs-client-provisioner
      containers:
        - name: nfs-client-provisioner
          image: quay.io/external_storage/nfs-client-provisioner:latest
          volumeMounts:
            - name: nfs-client-root
              mountPath: /persistentvolumes
          env:
            - name: PROVISIONER_NAME
              value: managed-nfs-storage  # must match the provisioner name in nfs-StorageClass.yaml
            - name: NFS_SERVER
              value: 172.16.12.27        # IP address of the NFS server
            - name: NFS_PATH
              value: /nfs/k8sdata        # directory exported by the NFS server
      volumes:
        - name: nfs-client-root
          nfs:
            server: 172.16.12.27
            path: /nfs/k8sdata
1.3 Deploy the StorageClass
kubectl apply -f rbac.yaml
kubectl apply -f nfs-StorageClass.yaml
kubectl apply -f nfs-provisioner.yaml
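To verify that dynamic provisioning works, you can create a throwaway PVC against the new StorageClass (the name test-claim below is arbitrary); it should reach the Bound state within a few seconds and a matching subdirectory should appear under /nfs/k8sdata:

# test-pvc.yaml -- temporary PVC used only to validate dynamic provisioning
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-claim
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: managed-nfs-storage
  resources:
    requests:
      storage: 1Mi

kubectl apply -f test-pvc.yaml
kubectl get pvc test-claim                     # STATUS should become Bound
kubectl logs deploy/nfs-client-provisioner     # check provisioner logs if it stays Pending
kubectl delete -f test-pvc.yaml

Note that on Kubernetes 1.20+ the legacy nfs-client-provisioner image may fail with a "selfLink was empty" error; its maintained successor is nfs-subdir-external-provisioner.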

2. Install Helm 3

2.1 Download and install Helm 3
# install Helm 3
curl https://baltocdn.com/helm/signing.asc | sudo apt-key add -
sudo apt-get install apt-transport-https --yes
echo "deb https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list
sudo apt-get update
sudo apt-get install helm
2.2 Add the chart repository
# add the bitnami Helm repository
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo update

3. Deploy kube-prometheus

3.1 Download kube-prometheus
helm search repo kube-prometheus
NAME                            CHART VERSION   APP VERSION     DESCRIPTION
bitnami/kube-prometheus         6.5.3           0.53.1          kube-prometheus collects Kubernetes manifests t...
 
# create the kube-prometheus namespace
kubectl create namespace kube-prometheus
namespace/kube-prometheus created
 
# pull the kube-prometheus chart at the pinned version
mkdir /opt/k8s/k8s_application
cd /opt/k8s/k8s_application
helm pull bitnami/kube-prometheus --version 6.5.3
tar -zxvf kube-prometheus-6.5.3.tgz
cd kube-prometheus
3.2 Edit values.yaml
# edit the values
vim values.yaml
18   storageClass: "managed-nfs-storage"  # the StorageClass created in section 1
 
prometheus
346 prometheus:
425     type: NodePort
438     nodePort: "30090"                     # NodePort exposed for Prometheus
805     storageClass: "managed-nfs-storage"
  
alertmanager
1125 alertmanager:
1199     type: NodePort
1213     nodePort: "30903"                    # NodePort exposed for Alertmanager
1499     storageClass: "managed-nfs-storage"
# corporate mailbox (SMTP) configuration
1400   config:
1401     global:
1402       resolve_timeout: 5m
1403       smtp_smarthost: 'smtp.exmail.qq.com:465'
1404       smtp_from: 'xxx@xxx.com'
1405       smtp_auth_username: 'xxx@xxx.com'
1406       smtp_auth_password: 'xxx'          # the mailbox SMTP authorization code, not the login password
1407       smtp_require_tls: false
1408     route:
1409       group_by: ['job']
1410       group_wait: 30s
1411       group_interval: 5m
1412       repeat_interval: 12h
1413       receiver: 'ops'
1414       routes:
1415         - match:
1416             alertname: Watchdog
1417           receiver: 'ops'
1418     receivers:
1419       - name: 'ops'
1420         email_configs:
1421           - to: 'xxx@xxx.com'
1422             send_resolved: true
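The line numbers above refer to values.yaml of chart version 6.5.3 and will drift in other chart versions; searching for the keys is more reliable than relying on fixed line numbers:

# locate the relevant keys in values.yaml
grep -n "storageClass:" values.yaml
grep -n "nodePort:" values.yaml
grep -n "alertmanager:" values.yaml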
3.3 Deploy kube-prometheus with Helm
root@master:/opt/k8s/k8s_application/kube-prometheus# pwd
/opt/k8s/k8s_application/kube-prometheus
root@master:/opt/k8s/k8s_application/kube-prometheus# ll
total 232
drwxr-xr-x  6 root root   4096 Jan  7 16:30 ./
drwxr-xr-x 10 root root   4096 Jan 10 09:09 ../
-rw-r--r--  1 root root    333 Dec 21 00:59 .helmignore
-rw-r--r--  1 root root    403 Dec 21 00:59 Chart.lock
-rw-r--r--  1 root root   1259 Dec 21 00:59 Chart.yaml
-rw-r--r--  1 root root 108864 Dec 21 00:59 README.md
drwxr-xr-x  5 root root   4096 Jan  7 10:55 charts/
drwxr-xr-x  2 root root   4096 Jan  7 14:55 crds/
drwxr-xr-x  2 root root   4096 Jan 10 09:15 rules/
drwxr-xr-x  6 root root   4096 Jan  7 10:55 templates/
-rw-r--r--  1 root root  87579 Jan  7 16:28 values.yaml

# install command
helm install -f values.yaml kube-prometheus bitnami/kube-prometheus --version 6.5.3 -n kube-prometheus

NAME: kube-prometheus
LAST DEPLOYED: Fri Jan  7 11:39:43 2022
NAMESPACE: kube-prometheus
STATUS: deployed
REVISION: 1
TEST SUITE: None
NOTES:
CHART NAME: kube-prometheus
CHART VERSION: 6.5.3
APP VERSION: 0.53.1

** Please be patient while the chart is being deployed **

Watch the Prometheus Operator Deployment status using the command:

    kubectl get deploy -w --namespace kube-prometheus -l app.kubernetes.io/name=kube-prometheus-operator,app.kubernetes.io/instance=kube-prometheus

Watch the Prometheus StatefulSet status using the command:

    kubectl get sts -w --namespace kube-prometheus -l app.kubernetes.io/name=kube-prometheus-prometheus,app.kubernetes.io/instance=kube-prometheus

Prometheus can be accessed via port "9090" on the following DNS name from within your cluster:

    kube-prometheus-prometheus.kube-prometheus.svc.cluster.local

To access Prometheus from outside the cluster execute the following commands:

    export NODE_PORT=$(kubectl get --namespace kube-prometheus -o jsonpath="{.spec.ports[0].nodePort}" services kube-prometheus-prometheus)
    export NODE_IP=$(kubectl get nodes --namespace kube-prometheus -o jsonpath="{.items[0].status.addresses[0].address}")
    echo "Prometheus URL: http://$NODE_IP:$NODE_PORT/"

Watch the Alertmanager StatefulSet status using the command:

    kubectl get sts -w --namespace kube-prometheus -l app.kubernetes.io/name=kube-prometheus-alertmanager,app.kubernetes.io/instance=kube-prometheus

Alertmanager can be accessed via port "9093" on the following DNS name from within your cluster:

    kube-prometheus-alertmanager.kube-prometheus.svc.cluster.local

To access Alertmanager from outside the cluster execute the following commands:

    export NODE_PORT=$(kubectl get --namespace kube-prometheus -o jsonpath="{.spec.ports[0].nodePort}" services kube-prometheus-alertmanager)
    export NODE_IP=$(kubectl get nodes --namespace kube-prometheus -o jsonpath="{.items[0].status.addresses[0].address}")
    echo "Alertmanager URL: http://$NODE_IP:$NODE_PORT/"


# to uninstall kube-prometheus
helm delete kube-prometheus -n kube-prometheus


# Prometheus NodePort: 30090
# Alertmanager NodePort: 30903
3.4 Check that the pods are running
root@master:~# kubectl get pods,svc -n kube-prometheus -o wide
NAME                                                      READY   STATUS    RESTARTS   AGE   IP              NODE    NOMINATED NODE   READINESS GATES
pod/alertmanager-kube-prometheus-alertmanager-0           2/2     Running   0          11m   10.244.135.55   node3   <none>           <none>
pod/kube-prometheus-kube-state-metrics-86b78f7c66-c5cdg   1/1     Running   0          11m   10.244.135.14   node3   <none>           <none>
pod/kube-prometheus-node-exporter-mpjn6                   1/1     Running   0          11m   172.16.12.29    node3   <none>           <none>
pod/kube-prometheus-operator-6fc5dbc49-g2l25              1/1     Running   0          11m   10.244.135.18   node3   <none>           <none>
pod/prometheus-kube-prometheus-prometheus-0               2/2     Running   0          11m   10.244.135.1    node3   <none>           <none>

NAME                                         TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                      AGE   SELECTOR
service/alertmanager-operated                ClusterIP   None             <none>        9093/TCP,9094/TCP,9094/UDP   11m   app.kubernetes.io/name=alertmanager
service/kube-prometheus-alertmanager         NodePort    10.108.168.165   <none>        9093:30903/TCP               11m   alertmanager=kube-prometheus-alertmanager,app.kubernetes.io/name=alertmanager
service/kube-prometheus-kube-state-metrics   ClusterIP   10.103.31.242    <none>        8080/TCP                     11m   app.kubernetes.io/instance=kube-prometheus,app.kubernetes.io/name=kube-state-metrics
service/kube-prometheus-node-exporter        ClusterIP   10.107.0.118     <none>        9100/TCP                     11m   app.kubernetes.io/instance=kube-prometheus,app.kubernetes.io/name=node-exporter
service/kube-prometheus-operator             ClusterIP   10.110.167.59    <none>        8080/TCP                     11m   app.kubernetes.io/component=operator,app.kubernetes.io/instance=kube-prometheus,app.kubernetes.io/name=kube-prometheus
service/kube-prometheus-prometheus           NodePort    10.103.120.125   <none>        9090:30090/TCP               11m   app.kubernetes.io/name=prometheus,prometheus=kube-prometheus-prometheus
service/prometheus-operated                  ClusterIP   None             <none>        9090/TCP                     11m   app.kubernetes.io/name=prometheus
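Once the Alertmanager pod is running, you can confirm that the SMTP settings from values.yaml actually reached Alertmanager. With the Prometheus Operator the rendered configuration lives in a Secret named alertmanager-<alertmanager-name>; the Secret name and key below are assumptions based on the release naming shown above (newer operator versions store the key gzipped):

# dump the rendered Alertmanager configuration
kubectl get secret alertmanager-kube-prometheus-alertmanager -n kube-prometheus \
  -o jsonpath='{.data.alertmanager\.yaml}' | base64 -d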
3.5 Access Prometheus and Alertmanager

The IP is the k8s-master node IP.
Prometheus: http://172.16.13.55:30090/targets
Alertmanager: http://172.16.13.55:30903/#/alerts

3.6 Configure Prometheus

1) Open the Prometheus targets page and you will see that the kube-proxy target is down.
2) This is because, by default, kube-proxy's metrics endpoint listens only on 127.0.0.1; it needs to be changed to 0.0.0.0.
Open the Kubernetes Dashboard, search for kube-proxy, find the ConfigMap and edit it; editing from the command line works too (see the sketch below).
3) Change the original 127.0.0.1 to 0.0.0.0.
4) Delete the kube-proxy pods on all nodes so they restart and pick up the modified configuration. Once they are back up, refresh the Prometheus targets page:
http://172.16.13.55:30090/targets
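A minimal command-line version of the same fix, assuming a kubeadm-provisioned cluster where kube-proxy's ConfigMap and pods live in kube-system:

# change metricsBindAddress from 127.0.0.1:10249 to 0.0.0.0:10249
kubectl edit configmap kube-proxy -n kube-system

# restart all kube-proxy pods so they reload the ConfigMap
kubectl delete pod -n kube-system -l k8s-app=kube-proxy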

4. Custom alert rules

Create a rules directory under the kube-prometheus chart directory; all custom alert rules live in this directory.

cd /opt/k8s/k8s_application/kube-prometheus
mkdir rules
cd rules
4.1 Example

Here is one example; other alert rules can be found online and applied in the same way.

# alert rules for the PostgreSQL database
vim postgresql-prometheusrule.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus-name: kube-prometheus-prometheus
    managed-by: prometheus-operator
  name: prometheus-pgsql-rules    # everything else is fixed; only this name is user-defined
  namespace: kube-prometheus
 
# the custom alert rules follow
spec:
  groups:
    - name: prometheus-exporter-postgresql.rules
      rules:
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          message: "PostgreSQL instance {{ $labels.prometheus_exporter }} is DOWN"
      - alert: PostgreSQLConnectionsHigh
        expr: (sum(pg_stat_database_numbackends) by (namespace,service,prometheus_exporter)) / on (namespace,service,prometheus_exporter) (pg_settings_max_connections) > 0.8
        for: 1m
        labels:
          severity: warning
        annotations:
          message: "PostgreSQL instance {{ $labels.prometheus_exporter }} has High number of connections"
      - alert: PostgreSQLQueriesPerSecondHigh
        expr: avg(rate(pg_stat_database_xact_commit[5m]) + rate(pg_stat_database_xact_rollback[5m])) by (namespace,service,prometheus_exporter,datname) > 10000
        for: 1m
        labels:
          severity: warning
        annotations:
          message: "PostgreSQL instance {{ $labels.prometheus_exporter }} has High number of queries per second"
      - alert: PostgreSQLCacheHitRateLow
        # cache hit ratio derived from postgres_exporter block read/hit counters
        expr: sum(rate(pg_stat_database_blks_hit{datname!~"template.*|"}[5m])) by (namespace,service,prometheus_exporter) / (sum(rate(pg_stat_database_blks_hit{datname!~"template.*|"}[5m])) by (namespace,service,prometheus_exporter) + sum(rate(pg_stat_database_blks_read{datname!~"template.*|"}[5m])) by (namespace,service,prometheus_exporter)) < 0.9
        for: 1m
        labels:
          severity: warning
        annotations:
          message: "PostgreSQL instance {{ $labels.prometheus_exporter }} has Low cache hit rate"
 
# apply the alert rules
kubectl apply -f postgresql-prometheusrule.yaml
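The operator picks new PrometheusRule objects up automatically; a quick way to confirm the resource was created before checking the Prometheus UI:

# list PrometheusRule resources in the namespace
kubectl get prometheusrules -n kube-prometheus

# show the rendered rule groups of the example above
kubectl get prometheusrule prometheus-pgsql-rules -n kube-prometheus -o yaml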
4.2 Other custom alert rules

The rules below were collected from the internet and cover most common needs; for anything specific, consult the official documentation of the service in question.

# download
https://gitee.com/songfei123123/kube-prometheus-rules
cd rules
# use the raw file paths; the /blob/ URLs return an HTML page instead of the YAML
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/postgresql-prometheusrule.yaml
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/kube-state-metrics-rules.yaml
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/k8s-rules.yaml
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/es-prometheusrule.yaml

# apply the rules
kubectl apply -f es-prometheusrule.yaml           # Elasticsearch alert rules
kubectl apply -f k8s-rules.yaml                   # Kubernetes cluster alert rules
kubectl apply -f kube-state-metrics-rules.yaml    # kube-state-metrics alert rules (from the upstream docs)
kubectl apply -f postgresql-prometheusrule.yaml   # PostgreSQL alert rules

Rule collection (Awesome Prometheus Alerts): https://awesome-prometheus-alerts.grep.to/rules#kubernetes
Other examples on GitHub: https://github.com/3scale-ops/prometheus-exporter-operator/tree/main/prometheus-rules

4.3 Check in the Dashboard whether the ConfigMap took effect

URL: https://k8s-master-ip:30008/#/overview?namespace=kube-prometheus

1) All newly added custom alert rules are appended to this ConfigMap.
2) By default, with no custom rules, the ConfigMap contains only the built-in section.
3) After the custom rules are applied, additional entries appear below it.
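The same check can be done without the Dashboard. The operator renders every selected PrometheusRule into a ConfigMap; the name used below assumes the usual prometheus-<prometheus-name>-rulefiles-0 convention:

# list the rule ConfigMaps generated by the operator
kubectl get configmap -n kube-prometheus | grep rulefiles

# confirm the custom alerts are included
kubectl describe configmap prometheus-kube-prometheus-prometheus-rulefiles-0 -n kube-prometheus | grep "alert:"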

4.4 View the alert rules in Prometheus

http://172.16.13.55:30090/alerts (wait a moment and refresh if the new rules do not show up immediately)

4.5 View Alertmanager

http://172.16.13.55:30903/#/alerts
The two corresponding alerts can be seen here.

4.6 Check the alert emails

The firing alerts are delivered to the mailbox configured in the 'ops' receiver.
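To exercise the mail path end to end without waiting for a real failure, a synthetic alert can be pushed straight to the Alertmanager v2 API (the alert name and labels below are made up for the test); an email should arrive after group_wait (30s in the configuration above):

curl -X POST http://172.16.13.55:30903/api/v2/alerts \
  -H "Content-Type: application/json" \
  -d '[{"labels":{"alertname":"MailDeliveryTest","job":"manual-test","severity":"warning"},"annotations":{"message":"Test alert to verify SMTP delivery"}}]'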
