1、
rules:
groups:
- name: example
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
serverity: page
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: NodeMemoryUsage
expr: ((node_memory_MemTotal) - (node_memory_MemFree+node_memory_Buffers+node_memory_Cached) ) / (node_memory_MemTotal) * 100 > 50
for: 2m
labels:
team: node
annotations:
summary: "{{$labels.instance}}: High Memory usage detected"
description: "{{$labels.instance}}: Memory usage is above 50% (current value is: {{ $value }}"
- alert: PodMemory
expr: sum(container_memory_rss{image!=""}) by(pod_name, namespace) / 1024/ 1024 / 1024 > 4
for: 2m
labels:
team: pod
annotations:
summary: "{{$labels.instance}}: High Memory detected"
description: "{{$labels.instance}}: Memory is above 4G (current value is: {{ $value }}"
2、
prometheus.yml
rule_files:
- /etc/config/rules
- /etc/config/alerts
scrape_configs:
- job_name: jvm-pods
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true;true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- __meta_kubernetes_service_annotation_prometheus_io_jvm_scrape
- action: replace
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_app_metrics_patn
target_label: __metrics_path__
regex: (.+)
- action: replace
source_labels:
- __meta_kubernetes_pod_ip
- __meta_kubernetes_service_annotation_prometheus_io_app_metrics_port
target_label: __address__
regex: (.+);(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: kubernetes_pod_name
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- job_name: nginx
static_configs:
- targets: ['10.147.255.12:80','10.147.255.12:81','10.147.255.12:82','10.147.255.12:83','10.147.255.12:84']
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-apiservers
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: default;kubernetes;https
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/${1}/proxy/metrics
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes-cadvisor
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: kubernetes_name
- honor_labels: true
job_name: prometheus-pushgateway
kubernetes_sd_configs:
- role: service
relabel_configs:
- action: keep
regex: pushgateway
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- job_name: kubernetes-services
kubernetes_sd_configs:
- role: service
metrics_path: /probe
params:
module:
- http_2xx
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- source_labels:
- __address__
target_label: __param_target
- replacement: blackbox
target_label: __address__
- source_labels:
- __param_target
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- source_labels:
- __meta_kubernetes_service_name
target_label: kubernetes_name
- job_name: kubernetes-pods
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: kubernetes_pod_name
alerting:
alertmanagers:
- kubernetes_sd_configs:
- role: pod
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace]
regex: prometheus
action: keep
- source_labels: [__meta_kubernetes_pod_label_app]
regex: prometheus
action: keep
- source_labels: [__meta_kubernetes_pod_label_component]
regex: alertmanager
action: keep
- source_labels: [__meta_kubernetes_pod_container_port_number]
regex:
action: drop
3、
alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.qq.com:465' # 邮箱smtp服务器代理
smtp_from: '******@qq.com' # 发送邮箱名称
smtp_auth_username: '******@qq.com' # 邮箱名称
smtp_auth_password: '****' # 邮箱授权码
smtp_require_tls: false
# 定义模板信息
templates:
- '*.tmpl'
route:
group_by: ['alertname'] # 报警分组依据
group_wait: 10s # 最初即第一次等待多久时间发送一组警报的通知
group_interval: 10s # 在发送新警报前的等待时间
repeat_interval: 5m # 发送重复警报的周期 对于email配置中,此项不可以设置过低,否则将会由于邮件发送太多频繁,被smtp服务器拒绝
receiver: 'mail' # 发送警报的接收者的名称,以下receivers name的名称
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://127.0.0.1:5001/'
- name: 'mail'
email_configs:
- to: ******@qq.com
html: '{{ template "mail.html" . }}' # 设定邮箱的内容模板
headers: { Subject: "[WARN] 报警邮件"} # 接收邮件的标题
- name: 'wechat'
wechat_configs: # 企业微信报警配置
- send_resolved: true
to_user: '11066879'
#to_party: '1' # 接收组的id
agent_id: '******' # (企业微信-->自定应用-->AgentId)
corp_id: '******' # 企业信息(我的企业-->CorpId[在底部])
api_secret: '******' # 企业微信(企业微信-->自定应用-->Secret)
message: '{{ template "wechat.html" . }}' # 发送消息模板的设定
# 一个inhibition规则是在与另一组匹配器匹配的警报存在的条件下,使匹配一组匹配器的警报失效的规则。两个警报必须具有一组相同的标签。
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
4、
mail.tmpl
{{ define "mail.html" }}
<table border="1">
<tr>
<td>报警项</td>
<td>实例</td>
<td>JOB</td>
<td>namespace</td>
<td>kubernetes_pod_name</td>
<td>container_name</td>
<td>描述</td>
<td>报警阀值</td>
<td>开始时间</td>
</tr>
{{ range $i, $alert := .Alerts }}
<tr>
<td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Labels "job" }}</td>
<td>{{ index $alert.Labels "kubernetes_namespace" }}</td>
<td>{{ index $alert.Labels "pod_name" }}</td>
<td>{{ index $alert.Labels "container_name" }}</td>
<td>{{ index $alert.Annotations "description" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
{{ end }}
5、
wechat.tmpl
{{ define "wechat.html" }}
{{ range $i, $alert := .Alerts.Firing }}
[报警项]:{{ index $alert.Labels "alertname" }}
[实例]:{{ index $alert.Labels "instance" }}
[JOB]:{{ index $alert.Labels "job" }}
[namespace]:{{ index $alert.Labels "kubernetes_namespace" }}
[kubernetes_pod_name]:{{ index $alert.Labels "pod_name" }}
[container_name]:{{ index $alert.Labels "container_name" }}
[描述]:{{ index $alert.Annotations "description" }}
[报警阀值]:{{ index $alert.Annotations "value" }}
[开始时间]:{{ $alert.StartsAt }}
{{ end }}
{{ end }}