创建prometheus.yml
~]# vim prometheus.yml
# Prometheus server configuration.
# Mounted into the container as /data/prometheus.yml by the docker run command.
global:
  scrape_interval: 60s      # how often every target is scraped
  evaluation_interval: 60s  # how often alerting rules are evaluated

# Alertmanager instance(s) that receive firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        # NOTE(review): when Prometheus runs in a bridge-networked container,
        # "localhost" is the Prometheus container itself. Replace it with the
        # host IP or a container name reachable from Prometheus - confirm
        # against your network setup.
        - targets: ["localhost:9093"]

# Path as seen INSIDE the container (rule.yml is mounted at /data/rule.yml),
# not the path on the Docker host.
rule_files:
  - "/data/rule.yml"

scrape_configs:
  # Prometheus scraping itself: inside the container it listens on 9090
  # (9091 is only the port published on the host).
  - job_name: 'prometheus'
    static_configs:
      - targets: ["localhost:9090"]

  # NOTE(review): same "localhost" caveat as the alerting section applies to
  # the two jobs below - use an address reachable from the Prometheus container.
  - job_name: 'node-exporter'
    static_configs:
      - targets: ["localhost:9100"]

  - job_name: 'alertmanager'
    static_configs:
      - targets: ["localhost:9093"]
运行Prometheus容器
# Start Prometheus; the UI/API is published on host port 9091.
# --storage.tsdb.retention was deprecated and later removed from Prometheus;
# --storage.tsdb.retention.time is the supported flag for the 90-day retention.
# NOTE(review): passing custom arguments replaces the image's default
# arguments, so --storage.tsdb.path is not set to /data here; the TSDB is
# presumably not written into the mounted data directory - confirm.
docker run -d \
  --name prometheus \
  --restart=always \
  -p 9091:9090 \
  -v /data/docker/prometheus/data:/data \
  -v /data/docker/prometheus/prometheus.yml:/data/prometheus.yml \
  -v /data/docker/prometheus/rule.yml:/data/rule.yml \
  prom/prometheus \
  --config.file=/data/prometheus.yml \
  --web.enable-lifecycle \
  --storage.tsdb.retention.time=90d
创建rule.yml
# 这里为了方便测试,可将报警条件临时设置为 up == 1
~]# vim rule.yml
# Alerting rules evaluated by Prometheus.
groups:
  - name: Hosts.rules
    rules:
      # Fires when a scraped target of one of the listed jobs reports up == 0.
      - alert: HostDown
        expr: up{job=~"node-exporter|prometheus|grafana|alertmanager"} == 0
        # Require the condition to hold for 1 minute so the alert matches the
        # "down for more than 1 minute" wording in the description below.
        for: 1m
        labels:
          severity: ERROR
        annotations:
          title: 'Instance down'
          summary: "{{$labels.instance}}"
          description: "主机: 【{{ $labels.instance }}】has been down for more than 1 minute"
## 注意: == 0 为报警条件,当有服务宕机时则会告警;如果只是测试告警流程,需改为 == 1
运行node-exporter容器
# Start node-exporter on host port 9100.
# The host's /proc, /sys and / are mounted read-only; the --path.* flags point
# the exporter at those mounts. Without them the mounts are never read.
docker run -d \
  --name node-exporter \
  --restart=always \
  -p 9100:9100 \
  -v "/proc:/host/proc:ro" \
  -v "/sys:/host/sys:ro" \
  -v "/:/rootfs:ro" \
  prom/node-exporter \
  --path.procfs=/host/proc \
  --path.sysfs=/host/sys \
  --path.rootfs=/rootfs
创建告警配置文件alertmanager.yml
# Alertmanager configuration: sends every alert by e-mail through QQ Mail SMTP.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '1017096373@qq.com'
  smtp_auth_username: '1017096373@qq.com'
  # QQ Mail SMTP authorization code, NOT the mailbox login password.
  # Never commit the real value; fill it in locally.
  smtp_auth_password: 'YOUR_QQ_MAIL_AUTH_CODE'
  smtp_require_tls: false
  smtp_hello: 'qq.com'

# Single route: group by alert name and deliver to the e-mail receiver.
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'

receivers:
  - name: 'email'
    email_configs:
      - to: '1017096373@qq.com'
        # Also send a notification when the alert is resolved.
        send_resolved: true
运行告警容器
# Start Alertmanager in the background, published on host port 9093, using the
# alertmanager.yml from the host as its configuration file.
docker run \
  --detach \
  --name alertmanager \
  --restart=always \
  --publish 9093:9093 \
  --volume /data/docker/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager