Prometheus告警设置流程
作者:蓝眼泪
PrometheusAlert支持邮件、企业微信、钉钉等多种告警通知方式。
timedatectl set-timezone Asia/Shanghai
ntpdate -u 1.cn.pool.ntp.org
systemctl enable ntpd
systemctl enable ntpdate
第一 prometheus告警项目资料地址
github:
https://github.com/feiyu563/PrometheusAlert
国内:
https://gitee.com/feiyu563/PrometheusAlert
共享自定义模板:
https://github.com/feiyu563/PrometheusAlert/issues/30
共享prometheus告警rules:
https://github.com/feiyu563/PrometheusAlert/issues/89
第二 在linux系统中部署alert
1 clone项目源代码
git clone https://github.com/feiyu563/PrometheusAlert.git
2 进入程序目录并运行PrometheusAlert
cd PrometheusAlert/example/linux/
./PrometheusAlert
3 后台运行请执行
nohup /usr/local/PrometheusAlert/example/linux/PrometheusAlert &
4 用浏览器打开网址http://192.168.1.130:8080
在conf/app.conf中有配置账号和密码。
用户名 prometheusalert
密码 prometheusalert
第三 用mysql作为后端数据存储
PrometheusAlert默认使用sqlite3作为后端自定义模板的存储,这种方式适合于单机部署,满足绝大部分生产场景使用。考虑到部分企业对于服务的高可用要求较高,同时也为了让PrometheusAlert更易于横向扩展,用户可以更改PrometheusAlert的默认存储为mysql。(推荐使用mysql 5.7及以上版本)
1.创建数据库
CREATE DATABASE prometheusalert CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
2.利用Navicat或命令行将db目录中的 prometheusalert.sql 导入数据库prometheusalert
use prometheusalert
source prometheusalert.sql
3.开启PrometheusAlert配置文件中关于mysql的配置 conf/app.conf,数据库名称与上面创建的数据库一致
#数据库驱动支持sqlite3和mysql,如使用mysql,请开启db_host,db_user,db_password,db_name的注释
db_driver=mysql
db_host=127.0.0.1:3306
db_user=root
db_password=root
db_name=prometheusalert
重启PrometheusAlert,即可完成配置。
第四 企业微信告警和钉钉告警
1 企业微信机器人地址
https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=e426580e-9aa1-4f5f-b06d-3591b2309d0b
2 钉钉机器人地址
https://oapi.dingtalk.com/robot/send?access_token=a2584e039c943a53b419e203a185633a224287cb9d7916f23cb8a715d273e0bb
附加信息
nohup /usr/local/PrometheusAlert/example/linux/PrometheusAlert &
/usr/local/alertmanager/alertmanager
cat /usr/local/alertmanager/alertmanager.yml
cat /usr/local/Prometheus/prometheus.yml
cat /usr/local/Prometheus/rules/*_alert.yml
cat /usr/local/Prometheus/rules/*_rules.yml
第五 告警模板和消息协议JSON内容
1 告警模板
{{ $var := .externalURL}}{{ range $k,$v:=.alerts }}
{{if eq $v.status "resolved"}}
## [Prometheus恢复信息]({{$v.generatorURL}})
#### [{{$v.labels.alertname}}]({{$var}})
###### 告警级别:{{$v.labels.level}}
###### 开始时间:{{$v.startsAt}}
###### 结束时间:{{$v.endsAt}}
###### 故障主机IP:{{$v.labels.instance}}
##### {{$v.annotations.description}}

{{else}}
## [Prometheus告警信息]({{$v.generatorURL}})
#### [{{$v.labels.alertname}}]({{$var}})
###### 告警级别:{{$v.labels.level}}
###### 开始时间:{{$v.startsAt}}
###### 结束时间:{{$v.endsAt}}
###### 故障主机IP:{{$v.labels.instance}}
##### {{$v.annotations.description}}

{{end}}
{{ end }}
2 消息协议JSON内容
{
"receiver": "prometheus-alert-center",
"status": "firing",
"alerts": [{
"status": "firing",
"labels": {
"alertname": "TargetDown",
"index": "1",
"instance": "example-1",
"job": "example",
"level": "2",
"service": "example"
},
"annotations": {
"description": "target was down! example dev /example-1 was down for more than 120s.",
"level": "2",
"timestamp": "2020-05-21 02:58:07.829 +0000 UTC"
},
"startsAt": "2020-05-21T02:58:07.830216179Z",
"endsAt": "0001-01-01T00:00:00Z",
"generatorURL": "https://prometheus-alert-center/graph?g0.expr=up%7Bjob%21%3D%22kubernetes-pods%22%2Cjob%21%3D%22kubernetes-service-endpoints%22%7D+%21%3D+1\u0026g0.tab=1",
"fingerprint": "e2a5025853d4da64"
}],
"groupLabels": {
"instance": "example-1"
},
"commonLabels": {
"alertname": "TargetDown",
"index": "1",
"instance": "example-1",
"job": "example",
"level": "2",
"service": "example"
},
"commonAnnotations": {
"description": "target was down! example dev /example-1 was down for more than 120s.",
"level": "2",
"timestamp": "2020-05-21 02:58:07.829 +0000 UTC"
},
"externalURL": "https://prometheus-alert-center",
"version": "4",
"groupKey": "{}/{job=~\"^(?:.*)$\"}:{instance=\"example-1\"}"
}
第六 告警规则编写
cd /usr/local/Prometheus
mkdir rules
cd rules/
vim ../prometheus.yml
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.1.130:9093
rule_files:
- "rules/*_rules.yml"
- "rules/*_alerts.yml"
1 alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['instance']
group_wait: 10m
group_interval: 10s
repeat_interval: 10m
receiver: 'web.hook.prometheusalert'
routes:
- receiver: 'prometheusalert-weixin'
group_wait: 10s
match:
level: '1'
- receiver: 'prometheusalert-dingding'
group_wait: 10s
match:
level: '2'
- receiver: 'prometheusalert-feishu'
group_wait: 10s
match:
level: '3'
- receiver: 'prometheusalert-ruliu'
group_wait: 10s
match:
level: '3'
- receiver: 'prometheusalert-all'
group_wait: 10s
match:
level: '4'
receivers:
- name: 'web.hook.prometheusalert'
webhook_configs:
- url: 'http://192.168.1.130:8080/prometheus/alert'
- name: 'prometheusalert-weixin'
webhook_configs:
- url: 'http://192.168.1.130:8080/prometheus/router?wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=e426580e-9aa1-4f5f-b06d-3591b2309d0b'
- name: 'prometheusalert-dingding'
webhook_configs:
- url: 'http://192.168.1.130:8080/prometheus/router?ddurl=https://oapi.dingtalk.com/robot/send?access_token=a2584e039c943a53b419e203a185633a224287cb9d7916f23cb8a715d273e0bb'
- name: 'prometheusalert-feishu'
webhook_configs:
- url: 'http://192.168.1.130:8080/prometheus/router?fsurl=https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx'
- name: 'prometheusalert-ruliu'
webhook_configs:
- url: 'http://192.168.1.130:8080/prometheus/router?groupid=123456'
- name: 'prometheusalert-all'
webhook_configs:
- url: 'http://192.168.1.130:8080/prometheus/router?wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=e426580e-9aa1-4f5f-b06d-3591b2309d0b&ddurl=https://oapi.dingtalk.com/robot/send?access_token=a2584e039c943a53b419e203a185633a224287cb9d7916f23cb8a715d273e0bb&email=136841202@qq.com&phone=13215029093'
2 node_rules.yml
groups:
- name: node_alert
rules:
- alert: 主机CPU告警
expr: node_load1 > 1
labels:
name: prometheusalertcenter
level: 3 #告警级别,告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
annotations:
description: "{{ $labels.instance }} CPU load占用过高" #告警信息
mobile: 13215029093,15888888882,15888888883 #告警发送目标手机号(需要设置电话和短信告警级别)
ddurl: "https://oapi.dingtalk.com/robot/send?access_token=a2584e039c943a53b419e203a185633a224287cb9d7916f23cb8a715d273e0bb,https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" #支持添加多个钉钉机器人告警,用,号分割即可,如果留空或者未填写,则默认发送到配置文件中填写的钉钉机器人地址
fsurl: "https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx,https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx" #支持添加多个飞书机器人告警,用,号分割即可,如果留空或者未填写,则默认发送到配置文件中填写的飞书机器人地址
wxurl: "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=e426580e-9aa1-4f5f-b06d-3591b2309d0b,https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxx-xxxx-xxxxxxx-xxxxx" #支持添加多个企业微信机器人告警,用,号分割即可,如果留空或者未填写,则默认发送到配置文件中填写的企业微信机器人地址
email: "136841202@qq.com,45678@baidu.com,91011@aliyun.com" #支持添加多个邮箱告警,用,号分割即可,如果留空或者未填写,则默认发送到配置文件中填写的邮箱地址
groupid: "12345,678910" #支持添加多个如流群id,用,号分割即可,如果留空或者未填写,则默认发送到配置文件中填写的如流群id
3 邮件告警
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.qiye.aliyun.cn:465'
smtp_from: 'wenjl@gpelec.cn'
smtp_auth_username: 'wenjl@gpelec.cn'
smtp_auth_password: 'xxxxxxxx'
smtp_require_tls: false
templates:
- 'wechat.tmpl'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1m
receiver: 'wechat'
routes:
- receiver: mail
match_re:
severity: mail
receivers:
- name: 'mail'
email_configs:
- to: '136841202@qq.com'
send_resolved: true
- name: 'wechat'
wechat_configs:
- corp_id: 'aaa'
to_party: '1'
to_user: '1'
agent_id: '1000001'
api_secret: 'abcd'
send_resolved: true
4 node-cpu_rules.yml
groups:
- name: node-cpu_alert
rules:
# cpu核数
- record: instance:node_cpus:count
expr: count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})
# 每个cpu使用率
- record: instance_cpu:node_cpu_seconds_not_idle:rate1m
expr: sum without (mode) (1 - rate(node_cpu_seconds_total{mode="idle"}[1m]))
# 总cpu使用率
- record: instance:node_cpu_utilization:ratio
expr: avg without (cpu) (instance_cpu:node_cpu_seconds_not_idle:rate1m)
- alert: cpu使用率大于88%
expr: instance:node_cpu_utilization:ratio * 100 > 88
for: 5m
labels:
severity: critical
level: 3
kind: CpuUsage
annotations:
summary: "cpu使用率大于88%"
description: "主机 {{ $labels.hostname }} 的cpu使用率为 {{ $value | humanize }}"
- alert: cpu使用率大于93%
expr: instance:node_cpu_utilization:ratio * 100 > 93
for: 2m
labels:
severity: emergency
level: 4
kind: CpuUsage
annotations:
summary: "cpu使用率大于93%"
description: "主机 {{ $labels.hostname }} 的cpu使用率为 {{ $value | humanize }}"
wxurl: "webhook1, webhook2"
mobile: "13xxx, 15xxx"
- alert: cpu负载大于Cores
expr: node_load5 > instance:node_cpus:count
for: 5m
labels:
severity: warning
level: 2
kind: CpuLoad
annotations:
summary: "cpu负载大于Cores"
description: "主机 {{ $labels.hostname }} 的cpu负载为 {{ $value }}"
- alert: cpu负载大于2Cores
expr: node_load5 > (instance:node_cpus:count * 2)
for: 2m
labels:
severity: critical
level: 3
kind: CpuLoad
annotations:
summary: "cpu负载大于2Cores"
description: "主机 {{ $labels.hostname }} 的cpu负载为 {{ $value }}"
wxurl: "webhook1, webhook2"
5 node2_rules.yml
groups:
- name: node2_alert # 组的名字,在这个文件中必须要唯一
rules:
- alert: InstanceDown # 告警的名字,在组中需要唯一
expr: up == 0 # 表达式, 执行结果为true: 表示需要告警
for: 1m # 超过多少时间才认为需要告警(即up==0需要持续的时间)
labels:
severity: warning # 定义标签
annotations:
summary: "服务 {{ $labels.instance }} 下线了"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."