Alertmanager
Deployed 2024-03-04
apiVersion: v1
kind: Service
metadata:
  name: server-alter
spec:
  type: NodePort
  ports:
  - name: server
    port: 30006
    targetPort: 9093
    nodePort: 30006
  selector:
    app: server-alter
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: server-alter
spec:
  replicas: 1                        # desired number of replicas
  #serviceName: server-alter-headless
  selector:
    matchLabels:
      app: server-alter              # target Pods carry this label
  template:                          # Pod template (replica definition)
    metadata:
      labels:
        app: server-alter            # label on each Pod replica, matched by the selector above
    spec:
      # affinity:
      #   nodeAffinity:
      #     requiredDuringSchedulingIgnoredDuringExecution:   # hard node affinity
      #       nodeSelectorTerms:
      #       - matchExpressions:
      #         - key: resource_pool          # label: resource pool
      #           operator: In
      #           values:
      #           - ty                        # label: ty/zy01/zy02
      #         - key: resource_type          # label: resource type
      #           operator: In
      #           values:
      #           - cpu_x86                   # label: gpu_p40/cpu_x86/cpu_arm
      containers:                    # container definitions for the Pod
      - name: server-alter           # container name
        image: prom/alertmanager
        imagePullPolicy: IfNotPresent
        securityContext:
          runAsUser: 0
          runAsGroup: 0
        ports:
        - containerPort: 9093
        # resources:
        #   requests:                # resources reserved for the Pod
        #     memory: "4Gi"
        #     cpu: "2"
        #   limits:                  # resource ceiling for the Pod
        #     memory: "4Gi"
        #     cpu: "2"
        #command: ["sleep","99999d"]
        volumeMounts:
        - name: localtime
          mountPath: /etc/localtime
        - name: cache-shm
          mountPath: /dev/shm
        # - name: data
        #   mountPath: "/prometheus"
        - name: config
          mountPath: '/etc/alertmanager/'
      volumes:
      - name: localtime
        hostPath:
          path: /etc/localtime
      - name: cache-shm
        emptyDir:
          medium: Memory
          sizeLimit: 256Mi
      - name: config
        nfs:
          server: 192.168.0.102
          path: /k8s/nfs/alter/alter
      # - name: data
      #   nfs:
      #     server: 10.142.101.1
      #     path: /data2/nfs/data/prometheus-self/data
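To roll this out and sanity-check it, something like the following works; the manifest file name server-alter.yaml and the default namespace are assumptions, and any node IP can be used for the NodePort.
# Apply the Service + Deployment (file name is an assumption)
kubectl apply -f server-alter.yaml

# Wait for the pod and confirm it is Running
kubectl get pods -l app=server-alter -o wide

# Alertmanager answers on the NodePort; 192.168.0.102 is one of the node IPs used elsewhere in these notes
curl -s http://192.168.0.102:30006/-/ready
curl -s http://192.168.0.102:30006/api/v2/status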
Once the deployment is up, configure Prometheus:
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: aicluster-prometheus
    zone: k8s
scrape_configs:
  # IaaS
  # - job_name: 'node-exporter'
  #   scrape_interval: 60s
  #   scrape_timeout: 20s
  #   http_sd_configs:
  #     - url: http://10.37.68.130:38000/api/bkcmdb/nodeTargets?job_name=node-exporter&zone=01
  #     - url: http://10.37.68.130:38000/api/bkcmdb/nodeTargets?job_name=node-exporter&zone=02
  #
  # - job_name: 'node-exporter-dmz'
  #   scrape_interval: 60s
  #   scrape_timeout: 20s
  #   proxy_url: http://10.143.165.70:32000
  #   relabel_configs:
  #     - target_label: job
  #       replacement: node-exporter
  #   http_sd_configs:
  #     - url: http://10.37.68.130:38000/api/bkcmdb/nodeTargets?job_name=node-exporter&zone=03
  - job_name: "prometheus-cadvisor"
    static_configs:
      - targets:
          - 192.168.0.101:30003
          - 192.168.0.102:30003
          - 192.168.0.103:30003
  - job_name: 'node-exporter'
    scrape_interval: 15s
    scrape_timeout: 5s
    scheme: http
    static_configs:
      - targets:
          - 192.168.0.101:39100
          - 192.168.0.102:39100
          - 192.168.0.103:39100
        labels:
          env: prod
          project: k8s
          module: k8s
  - job_name: 'kube-state-metrics'
    static_configs:
      - targets:
          - 192.168.0.101:30004
        labels:
          cluster: k8s
          zone: k8s
          owner: k8s
          instance_type: k8s
  - job_name: 'nginx-vts'
    static_configs:
      - targets: ["192.168.249.81:39913"]
  # - job_name: 'mongodb_exporter'
  #   scrape_interval: 15s
  #   scrape_timeout: 5s
  #   scheme: http
  #   static_configs:
  #     - targets:
  #         - 10.142.150.180:9216
  #         - 10.142.150.181:9216
  #         - 10.142.150.211:9216
  #       labels:
  #         env: prod
  #         project: aicluster
  #         module: aicluster
  #
  # - job_name: 'mysql-exporter'
  #   scrape_interval: 15s
  #   scrape_timeout: 5s
  #   scheme: http
  #   static_configs:
  #     - targets:
  #         - 10.143.169.27:19104
  #         - 10.143.170.113:19104
  #       labels:
  #         id: aicluster
  #         env: prod
  #         project: aicluster
  #         module: aicluster
  #
  # - job_name: 'nginx-vts-exporter'
  #   scrape_interval: 15s
  #   scrape_timeout: 5s
  #   scheme: http
  #   static_configs:
  #     - targets:
  #         - 10.143.164.201:39913
  #         - 10.143.164.202:39913
  #         - 10.143.164.203:39913
  #         - 10.143.164.204:39913
  #         - 10.127.23.240:39913
  #       labels:
  #         env: prod
  #         project: aicluster
  #
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.249.81:30006
rule_files:
  - "alert-rules-base.yml"
  #- "second_rules.yml"   # alert rules
remote_write:
  - url: http://192.168.249.83:8428/api/v1/write
    remote_timeout: 30s
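Before (re)loading, the file can be validated with promtool. A sketch, assuming the config is saved as prometheus.yml next to alert-rules-base.yml and that Prometheus runs with --web.enable-lifecycle (otherwise restart the Prometheus process or pod instead):
# Validate prometheus.yml and every file listed under rule_files
promtool check config prometheus.yml

# Hot-reload the running Prometheus; <prometheus-host> is a placeholder,
# and this endpoint only responds when --web.enable-lifecycle is set
curl -X POST http://<prometheus-host>:9090/-/reload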
Alert rules (alert-rules-base.yml):
groups:
- name: monitor_base
  rules:
  - alert: CpuUsageAlert_warning
    expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.030
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} CPU usage high"
      description: "{{ $labels.instance }} CPU usage above 60% (current value: {{ $value }})"
  - alert: CpuUsageAlert_serious
    #expr: sum(avg(irate(node_cpu_seconds_total{mode!='idle'}[5m])) without (cpu)) by (instance) > 0.85
    expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{job=~".*",mode="idle"}[5m])) * 100)) > 2
    for: 3m
    labels:
      level: serious
    annotations:
      summary: "Instance {{ $labels.instance }} CPU usage high"
      description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
  - alert: MemUsageAlert_warning
    expr: avg by(instance) ((1 - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes) * 100) > 70
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} MEM usage high"
      description: "{{$labels.instance}}: MEM usage is above 70% (current value is: {{ $value }})"
  - alert: MemUsageAlert_serious
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.90
    for: 3m
    labels:
      level: serious
    annotations:
      summary: "Instance {{ $labels.instance }} MEM usage high"
      description: "{{ $labels.instance }} MEM usage above 90% (current value: {{ $value }})"
  - alert: DiskUsageAlert_warning
    expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 80
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Disk usage high"
      description: "{{$labels.instance}}: Disk usage is above 80% (current value is: {{ $value }})"
  - alert: DiskUsageAlert_serious
    expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 90
    for: 3m
    labels:
      level: serious
    annotations:
      summary: "Instance {{ $labels.instance }} Disk usage high"
      description: "{{$labels.instance}}: Disk usage is above 90% (current value is: {{ $value }})"
  - alert: NodeFileDescriptorUsage
    expr: avg by (instance) (node_filefd_allocated{} / node_filefd_maximum{}) * 100 > 60
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} File Descriptor usage high"
      description: "{{$labels.instance}}: File Descriptor usage is above 60% (current value is: {{ $value }})"
  - alert: NodeLoad15
    expr: avg by (instance) (node_load15{}) > 80
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Load15 usage high"
      description: "{{$labels.instance}}: Load15 is above 80 (current value is: {{ $value }})"
  - alert: NodeAgentStatus
    expr: avg by (instance) (up{}) == 0
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "{{$labels.instance}}: has been down"
      description: "{{$labels.instance}}: Node_Exporter Agent is down (current value is: {{ $value }})"
  - alert: NodeProcsBlocked
    expr: avg by (instance) (node_procs_blocked{}) > 10
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Process Blocked usage high"
      description: "{{$labels.instance}}: Node Blocked Procs detected! above 10 (current value is: {{ $value }})"
  - alert: NetworkTransmitRate
    #expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
    expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
    for: 1m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Network Transmit Rate usage high"
      description: "{{$labels.instance}}: Node Transmit Rate (upload) is above 40Mbps (current value is: {{ $value }}Mbps)"
  - alert: NetworkReceiveRate
    #expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{device="ens192"}[2m]) / 1024 / 1024)) > 50
    expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{}[2m]) / 1024 / 1024 * 8 )) > 40
    for: 1m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Network Receive Rate usage high"
      description: "{{$labels.instance}}: Node Receive Rate (download) is above 40Mbps (current value is: {{ $value }}Mbps)"
  - alert: DiskReadRate
    expr: avg by (instance) (floor(irate(node_disk_read_bytes_total{}[2m]) / 1024 )) > 200
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Disk Read Rate usage high"
      description: "{{$labels.instance}}: Node Disk Read Rate is above 200KB/s (current value is: {{ $value }}KB/s)"
  - alert: DiskWriteRate
    expr: avg by (instance) (floor(irate(node_disk_written_bytes_total{}[2m]) / 1024 / 1024 )) > 20
    for: 2m
    labels:
      level: warning
    annotations:
      summary: "Instance {{ $labels.instance }} Disk Write Rate usage high"
      description: "{{$labels.instance}}: Node Disk Write Rate is above 20MB/s (current value is: {{ $value }}MB/s)"
Configure Alertmanager to push alerts to WeCom (企业微信):
global:
  resolve_timeout: 10m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '2425328600@qq.com'
  smtp_auth_username: '2425328600@qq.com'
  smtp_auth_password: 'smkuagzpmkwcebec'
  smtp_require_tls: false
templates:
  - '/etc/alertmanager/wechat.tmpl'
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  receiver: 'wechat'
  # routes:
  #   - match_re:
  #       severity: critical
  #     receiver: 'wechat'
receivers:
  # - name: 'email'
  #   email_configs:
  #     - to: '2425328600@qq.com'
  #       send_resolved: true
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'wwf7444470c0f7986b'   # WeCom: My Enterprise -> Enterprise ID
        to_party: '2'                   # department ID
        agent_id: '1000002'             # App Management -> AgentId
        api_secret: '5Bg1H94IJIQrVffAn4Jh45HsecVuHtuA6vNwNoSaWIs'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
······
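The config above references /etc/alertmanager/wechat.tmpl, which is not shown in these notes. Below is a minimal sketch of what it could contain: overriding the define named wechat.default.message replaces Alertmanager's built-in WeCom message body; the exact fields, the file path on the NFS host, and the config file name are assumptions based on the volume defined earlier. The amtool check and the hand-fired test alert are optional sanity checks.
# Hypothetical minimal wechat.tmpl, written on the NFS host into the directory
# that is mounted at /etc/alertmanager/ in the pod (path assumed from the volume above)
cat > /k8s/nfs/alter/alter/wechat.tmpl <<'EOF'
{{ define "wechat.default.message" }}
{{- range .Alerts }}
[{{ .Status }}] {{ .Labels.alertname }}
instance: {{ .Labels.instance }}
level: {{ .Labels.level }}
summary: {{ .Annotations.summary }}
description: {{ .Annotations.description }}
starts: {{ .StartsAt }}
{{- end }}
{{ end }}
EOF

# Validate the config inside the pod; the prom/alertmanager image ships amtool,
# and its default config path is /etc/alertmanager/alertmanager.yml
kubectl exec deploy/server-alter -- amtool check-config /etc/alertmanager/alertmanager.yml

# Fire a synthetic alert straight at the Alertmanager NodePort to exercise the WeCom path
curl -XPOST http://192.168.249.81:30006/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","level":"warning","instance":"manual-test"},"annotations":{"summary":"test alert","description":"manual test from curl"}}]'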
Create the bot in the WeCom web console (企微网页版).
If an error is reported, configure this part by setting up an nginx:
#server {
#
#    listen 9096;
#    server_name localhost;
#
#    location / {
#        root /opt/openresty/nginx/conf/conf.d/xz;
#        sendfile on;
#        autoindex on;
#        charset utf-8,gbk;
#    }
#}
Publish the files under that directory, then fill in the IP.
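A quick way to confirm nginx is actually serving the directory, assuming the server block above has been enabled; <nginx-host> is a placeholder for the IP filled in above:
# autoindex is on, so a plain GET should list the published files
curl http://<nginx-host>:9096/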