AlertManager配置
- 下载解压alertmanager
- 配置启动文件
[Unit]
Description=alertmanager
[Service]
WorkingDirectory=/usr/local/alertmanager/
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/usr/local/alertmanager/data --web.listen-address=:9093 --data.retention=120h
Restart=on-failure
[Install]
WantedBy=multi-user.target
- 修改配置文件
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.139.com:465'
smtp_from: 'xxxxxx@139.com'
smtp_auth_username: 'xxxxx'
smtp_auth_password: 'xxxxx' # 开启smtp后授权的密码
smtp_require_tls: false
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 30s
repeat_interval: 10m
receiver: 'email'
receivers:
- name: 'email'
email_configs:
- to: 'xxxxx@qq.com'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
检查配置文件
[root@hadoop01 alertmanager]# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' SUCCESS
Found:
- global config
- route
- 1 inhibit rules
- 1 receivers
- 0 templates
在浏览器中访问 http://<服务器IP>:9093 查看 Alertmanager 界面
- 创建规则
cd /usr/local/prometheus/
mkdir rules
touch rules/node_alerts.yml rules/node_rules.yml
node_rules.yml
groups:
- name: node_rules
# interval: 15s
rules:
- record: instance:node_cpu_usage
expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (nodename)*100
labels:
metric_type: cpu_monitor
- record: instance:node_mem_usage
expr: 100 - (node_memory_MemAvailable_bytes)/(node_memory_MemTotal_bytes) * 100
labels:
metric_type: Memory_monitor
node_alerts.yml
groups:
- name: node_alerts
rules:
- alert: cpu_usage_over_threshold
expr: instance:node_cpu_usage > 80
for: 1m # 条件持续满足1分钟后才触发告警
labels:
severity: warning
annotations:
summary: 主机 {{ $labels.nodename }} 的cpu使用率持续1分钟超出阈值,当前为 {{ $value }} %
- alert: mem_usage_over_threshold
expr: instance:node_mem_usage > 80
for: 1m
labels:
severity: warning
annotations:
summary: 主机 {{ $labels.nodename }} 的内存使用率持续1分钟超出阈值,当前为 {{ $value }} %
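检查配置文件(需先在 prometheus.yml 的 rule_files 中加入 rules/node_rules.yml 和 rules/node_alerts.yml,并在 alerting 中配置 Alertmanager 地址)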
[root@hadoop01 prometheus]# ./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: 2 rule files found
Checking rules/node_rules.yml
SUCCESS: 2 rules found
Checking rules/node_alerts.yml
SUCCESS: 2 rules found
加载配置文件(Prometheus 启动时需带 --web.enable-lifecycle 参数,否则 /-/reload 接口不可用)
curl -X POST http://localhost:9090/-/reload