Alert manager

本文详细描述了如何在Kubernetes中部署Alertmanager服务和配置Prometheus以监控节点和应用,包括设置告警规则并集成到企业微信的通知系统。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

Alert manager

2024-03-04 部署:在 Kubernetes 中部署 Alertmanager(Service + Deployment)

# NodePort Service that exposes the pods labelled app=server-alter
# on port 30006 of every cluster node.
apiVersion: v1
kind: Service
metadata:
  name: server-alter
spec:
  type: NodePort
  # Traffic is routed only to pods carrying this label.
  selector:
    app: server-alter
  ports:
    - name: server
      nodePort: 30006   # port opened on each node
      port: 30006       # port of the Service itself
      targetPort: 9093  # container port that receives the traffic
---
# Deployment that runs one Alertmanager pod labelled app=server-alter.
# The configuration directory /etc/alertmanager/ is backed by an NFS export.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server-alter
spec:
  replicas: 1  # desired number of replicas
  # serviceName: server-alter-headless
  selector:
    matchLabels:
      app: server-alter  # pods carrying this label belong to this Deployment
  template:  # pod template used to create every replica
    metadata:
      labels:
        app: server-alter  # label set on each replica; must match the selector above
    spec:
      # affinity:
      #   nodeAffinity:
      #     requiredDuringSchedulingIgnoredDuringExecution:  # hard node affinity
      #       nodeSelectorTerms:
      #         - matchExpressions:
      #             - key: resource_pool  # label: resource pool
      #               operator: In
      #               values:
      #                 - ty  # label values: ty/zy01/zy02
      #             - key: resource_type  # label: resource type
      #               operator: In
      #               values:
      #                 - cpu_x86  # label values: gpu_p40/cpu_x86/cpu_arm
      containers:  # containers that make up the pod
        - name: server-alter  # container name
          # Pinned to an explicit release. Without a tag the image resolves to
          # `latest`, and with IfNotPresent every node keeps running whatever
          # `latest` it happened to cache, so replicas are not reproducible.
          image: prom/alertmanager:v0.27.0
          imagePullPolicy: IfNotPresent
          securityContext:
            # NOTE(review): the container runs as root, presumably so it can
            # read/write the NFS export - confirm, and drop to a non-root
            # UID/GID if the export permissions allow it.
            runAsUser: 0
            runAsGroup: 0
          ports:
            - containerPort: 9093
          # resources:
          #   requests:  # resources reserved for the pod
          #     memory: "4Gi"
          #     cpu: "2"
          #   limits:  # upper resource bound for the pod
          #     memory: "4Gi"
          #     cpu: "2"
          # command: ["sleep", "99999d"]
          volumeMounts:
            - name: localtime
              mountPath: /etc/localtime
              # The container is root; mount the host file read-only so it
              # can never rewrite the node's timezone file.
              readOnly: true
            - name: cache-shm
              mountPath: /dev/shm
            # - name: data
            #   mountPath: "/prometheus"
            - name: config
              mountPath: '/etc/alertmanager/'
      volumes:
        # Host timezone file, shared so container timestamps match the node.
        - name: localtime
          hostPath:
            path: /etc/localtime
        # Memory-backed scratch space mounted at /dev/shm.
        - name: cache-shm
          emptyDir:
            medium: Memory
            sizeLimit: 256Mi
        # NFS export mounted as the Alertmanager configuration directory.
        - name: config
          nfs:
            server: 192.168.0.102
            path: /k8s/nfs/alter/alter
        # - name: data
        #   nfs:
        #     server: 10.142.101.1
        #     path: /data2/nfs/data/prometheus-self/data

Alertmanager 部署完成后,继续进行下面的配置。

配置 Prometheus

# Prometheus defaults that apply unless a scrape job overrides them.
global:
  evaluation_interval: 15s  # interval between rule evaluations
  scrape_interval: 15s      # interval between scrapes of each target
  # Labels attached when talking to external systems
  # (remote write, Alertmanager).
  external_labels:
    zone: k8s
    cluster: aicluster-prometheus
# Scrape jobs. All active jobs use static target lists.
scrape_configs:
  # IaaS (disabled): node-exporter targets discovered over HTTP service discovery.
  # - job_name: 'node-exporter'
  #   scrape_interval: 60s
  #   scrape_timeout: 20s
  #   http_sd_configs:
  #     - url: http://10.37.68.130:38000/api/bkcmdb/nodeTargets?job_name=node-exporter&zone=01
  #     - url: http://10.37.68.130:38000/api/bkcmdb/nodeTargets?job_name=node-exporter&zone=02
  #
  # - job_name: 'node-exporter-dmz'
  #   scrape_interval: 60s
  #   scrape_timeout: 20s
  #   proxy_url: http://10.143.165.70:32000
  #   relabel_configs:
  #     - target_label: job
  #       replacement: node-exporter
  #   http_sd_configs:
  #     - url: http://10.37.68.130:38000/api/bkcmdb/nodeTargets?job_name=node-exporter&zone=03

  # cAdvisor endpoint on each of the three nodes.
  - job_name: 'prometheus-cadvisor'
    static_configs:
      - targets:
          - '192.168.0.101:30003'
          - '192.168.0.102:30003'
          - '192.168.0.103:30003'

  # node-exporter endpoint on each of the three nodes.
  - job_name: 'node-exporter'
    scheme: http
    scrape_interval: 15s
    scrape_timeout: 5s
    static_configs:
      - targets:
          - '192.168.0.101:39100'
          - '192.168.0.102:39100'
          - '192.168.0.103:39100'
        labels:
          env: prod
          project: k8s
          module: k8s

  # Single kube-state-metrics endpoint.
  - job_name: 'kube-state-metrics'
    static_configs:
      - targets:
          - '192.168.0.101:30004'
        labels:
          cluster: k8s
          zone: k8s
          owner: k8s
          instance_type: k8s

  # nginx-vts endpoint.
  - job_name: 'nginx-vts'
    static_configs:
      - targets:
          - '192.168.249.81:39913'
      

#  - job_name: 'mongodb_exporter'
#    scrape_interval: 15s
#    scrape_timeout: 5s
#    scheme: http
#    static_configs:
#    - targets:
#      - 10.142.150.180:9216
#      - 10.142.150.181:9216
#      - 10.142.150.211:9216
#      labels:
#        env: prod
#        project: aicluster
#        module: aicluster
#
#  - job_name: 'mysql-exporter'                                                           
#    scrape_interval: 15s                                                                   
#    scrape_timeout: 5s                                                                     
#    scheme: http                                                                           
#    static_configs:                                                                        
#    - targets:                                                                             
#      - 10.143.169.27:19104                                                                
#      - 10.143.170.113:19104
#      labels:
#        id: aicluster 
#        env: prod               
#        project: aicluster      
#        module: aicluster
#
#  - job_name: 'nginx-vts-exporter'
#    scrape_interval: 15s
#    scrape_timeout: 5s
#    scheme: http
#    static_configs:
#    - targets:
#      - 10.143.164.201:39913
#      - 10.143.164.202:39913
#      - 10.143.164.203:39913
#      - 10.143.164.204:39913
#      - 10.127.23.240:39913
#      labels:
#        env: prod
#        project: aicluster
# 
# Alertmanager instance that receives the alerts fired by the rules below.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.249.81:30006'

# Rule files loaded by this Prometheus instance.
rule_files:
  - 'alert-rules-base.yml'
  # - "second_rules.yml"  # additional alert rules

# Samples are also forwarded to this remote-write endpoint.
remote_write:
  - url: http://192.168.249.83:8428/api/v1/write
    remote_timeout: 30s

告警规则(即上面 rule_files 中引用的 alert-rules-base.yml)

# Baseline host alert rules (CPU, memory, disk, network, node health).
groups:
    - name: monitor_base
      rules:
      # Warning: non-idle CPU fraction per instance above 60% for 2 minutes.
      # The expression yields a 0..1 fraction, so 60% is written as 0.60.
      # The previous threshold (0.030, i.e. 3%) contradicted the description
      # and fired on nearly idle hosts.
      # The alert name keeps its original spelling because routes and
      # silences may reference it.
      - alert: CpuUsageAlert_waring
        expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.60
        for: 2m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 60% (current value: {{ $value }})"
      # Serious: CPU usage per instance (100 minus idle percentage) above 85%
      # for 3 minutes. The expression yields a 0..100 percentage; the previous
      # threshold (2) contradicted the description and fired at 2% usage.
      - alert: CpuUsageAlert_serious
        # Earlier fraction-based variant, kept for reference:
        # expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.85
        expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{job=~".*",mode="idle"}[5m])) * 100)) > 85
        for: 3m
        labels:
          level: serious
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
      # Warning: memory not counted as free, buffers or cache exceeds 70% of
      # total memory for 2 minutes.
      - alert: MemUsageAlert_waring
        for: 2m
        expr: >-
          avg by(instance) ((1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100) > 70
        labels:
          level: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} MEM usage high'
          description: '{{$labels.instance}}: MEM usage is above 70% (current value is: {{ $value }})'
      # Serious: (MemTotal - MemAvailable) / MemTotal above 0.90 for 3 minutes.
      - alert: MemUsageAlert_serious
        for: 3m
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.90
        labels:
          level: serious
        annotations:
          summary: 'Instance {{ $labels.instance }} MEM usage high'
          description: '{{ $labels.instance }} MEM usage above 90% (current value: {{ $value }})'
      # Warning: filesystem usage above 80% for 2 minutes. rootfs and mount
      # points under /run, /var, /sys and /dev are excluded by the selector.
      - alert: DiskUsageAlert_warning
        for: 2m
        expr: >-
          (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 80
        labels:
          level: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} Disk usage high'
          description: '{{$labels.instance}}: Disk usage is above 80% (current value is: {{ $value }})'
      # Serious: filesystem usage above 90% for 3 minutes. rootfs and mount
      # points under /run, /var, /sys and /dev are excluded by the selector.
      - alert: DiskUsageAlert_serious
        for: 3m
        expr: >-
          (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 90
        labels:
          level: serious
        annotations:
          summary: 'Instance {{ $labels.instance }} Disk usage high'
          description: '{{$labels.instance}}: Disk usage is above 90% (current value is: {{ $value }})'
      # Warning: allocated file descriptors exceed 60% of the maximum for
      # 2 minutes.
      - alert: NodeFileDescriptorUsage
        for: 2m
        expr: >-
          avg by (instance) (node_filefd_allocated{} / node_filefd_maximum{}) * 100 > 60
        labels:
          level: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} File Descriptor usage high'
          description: '{{$labels.instance}}: File Descriptor usage is above 60% (current value is: {{ $value }})'
      # Warning: node_load15 per instance above 80 for 2 minutes.
      - alert: NodeLoad15
        for: 2m
        expr: >-
          avg by (instance) (node_load15{}) > 80
        labels:
          level: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} Load15 usage high'
          description: '{{$labels.instance}}: Load15 is above 80 (current value is: {{ $value }})'
      # Warning: the `up` metric of an instance has been 0 for 2 minutes.
      # NOTE(review): `up{}` has no job selector, so this matches targets of
      # every scrape job, while the description names Node_Exporter - confirm
      # whether the rule should be limited to the node-exporter job.
      - alert: NodeAgentStatus
        for: 2m
        expr: >-
          avg by (instance) (up{}) == 0
        labels:
          level: warning
        annotations:
          summary: '{{$labels.instance}}: has been down'
          description: '{{$labels.instance}}: Node_Exporter Agent is down (current value is: {{ $value }})'
      # Warning: node_procs_blocked per instance above 10 for 2 minutes.
      - alert: NodeProcsBlocked
        for: 2m
        expr: >-
          avg by (instance) (node_procs_blocked{}) > 10
        labels:
          level: warning
        annotations:
          summary: 'Instance {{ $labels.instance }}  Process Blocked usage high'
          description: '{{$labels.instance}}: Node Blocked Procs detected! above 10 (current value is: {{ $value }})'
      # Warning: transmit rate above 40 Mbit/s for 1 minute. The expression
      # converts bytes/s to Mbit/s (/1024/1024*8) and averages over all
      # network devices of an instance.
      # The description previously printed the unit as "Mbps/s"; Mbps is
      # already a per-second rate, so the unit is now "Mbps".
      - alert: NetworkTransmitRate
        # Earlier single-device variant (MB/s), kept for reference:
        # expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
        expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
        for: 1m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Network Transmit Rate usage high"
          description: "{{$labels.instance}}: Node Transmit Rate (Upload) is above 40Mbps (current value is: {{ $value }}Mbps)"
      # Warning: receive rate above 40 Mbit/s for 1 minute. The expression
      # converts bytes/s to Mbit/s (/1024/1024*8) and averages over all
      # network devices of an instance.
      # The description previously printed the unit as "Mbps/s"; Mbps is
      # already a per-second rate, so the unit is now "Mbps".
      - alert: NetworkReceiveRate
        # Earlier single-device variant (MB/s), kept for reference:
        # expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
        expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
        for: 1m
        labels:
          level: warning
        annotations:
          summary: "Instance {{ $labels.instance }} Network Receive Rate usage high"
          description: "{{$labels.instance}}: Node Receive Rate (Download) is above 40Mbps (current value is: {{ $value }}Mbps)"
      # Warning: disk read rate above 200 KB/s for 2 minutes (bytes/s / 1024).
      - alert: DiskReadRate
        for: 2m
        expr: >-
          avg by (instance) (floor(irate(node_disk_read_bytes_total{}[2m]) / 1024 )) > 200
        labels:
          level: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} Disk Read Rate usage high'
          description: '{{$labels.instance}}: Node Disk Read Rate is above 200KB/s (current value is: {{ $value }}KB/s)'
      # Warning: disk write rate above 20 MB/s for 2 minutes
      # (bytes/s / 1024 / 1024).
      - alert: DiskWriteRate
        for: 2m
        expr: >-
          avg by (instance) (floor(irate(node_disk_written_bytes_total{}[2m]) / 1024 / 1024 )) > 20
        labels:
          level: warning
        annotations:
          summary: 'Instance {{ $labels.instance }} Disk Write Rate usage high'
          description: '{{$labels.instance}}: Node Disk Write Rate is above 20MB/s (current value is: {{ $value }}MB/s)'

配置 Alertmanager,将告警推送到企业微信

# Alertmanager configuration: alerts are grouped by alertname and delivered
# to the 'wechat' (WeCom / Enterprise WeChat) receiver.
global:
  resolve_timeout: 10m
  # SMTP settings. No active receiver below uses email (the email receiver is
  # commented out).
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '2425328600@qq.com'
  smtp_auth_username: '2425328600@qq.com'
  # Credential redacted. A real password was published here in plain text;
  # treat it as compromised, rotate it, and inject the new value at deploy
  # time instead of committing it.
  smtp_auth_password: '<SMTP_AUTH_PASSWORD>'
  smtp_require_tls: false

# Notification templates loaded from the mounted configuration directory.
templates:
  - '/etc/alertmanager/wechat.tmpl'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  receiver: 'wechat'
  # routes:
  #   - match_re:
  #       severity: critical
  #     receiver: 'wechat'

receivers:
  # - name: 'email'
  #   email_configs:
  #     - to: '2425328600@qq.com'
  #       send_resolved: true
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'wwf7444470c0f7986b'  # WeCom: My Company -> Company ID
        to_party: '2'                  # department ID
        agent_id: '1000002'            # App Management -> AgentId
        # Credential redacted. A real application secret was published here
        # in plain text; treat it as compromised, regenerate it in the WeCom
        # console, and inject the new value at deploy time.
        api_secret: '<WECHAT_API_SECRET>'
        send_resolved: true

# NOTE(review): this rule matches on a `severity` label (critical/warning)
# and requires source and target to share the same alertname. Confirm that
# the alert rules actually set `severity`; rules that use a different label
# name (for example `level`) are never inhibited by it.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

······
创建机器人:
登录企业微信网页版进行操作。

在这里插入图片描述

企业 ID(对应配置中的 corp_id):wwf7444470c0f7986b

如果报错(见下图):在这里插入图片描述
需要在这里进行配置(见下图):
在这里插入图片描述

搭建一个 Nginx,用于对外发布文件:

# Commented-out nginx server block. When enabled (leading '#' removed) it
# listens on port 9096 and serves the directory
# /opt/openresty/nginx/conf/conf.d/xz with directory listing (autoindex) on.
#server {
#
#     listen  9096;
#     server_name localhost;
#    
#
#   location / {
#       root /opt/openresty/nginx/conf/conf.d/xz;
#       sendfile on;
#       autoindex on;
#       charset utf-8,gbk;
#    }
#}

在这里插入图片描述

将这里的文件发布出去
在这里插入图片描述

把 IP 填上。

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值