1 规划准备
两台机器均为CentOS 7.9系统,内核版本为:3.10.0-1160.11.1.el7.x86_64
机器列表
master01 10.0.8.1
node01 10.0.8.2
服务基本部署在master01上,node01只作为被监控项,下述中如果没有指定主机,则默认在master01上部署
提前创建用户
# Create one group plus one login-less account per service. Each account gets
# /var/lib/<name> as its home directory and /sbin/nologin as its shell.
for svc_account in prometheus alertmanager grafana consul; do
  sudo groupadd "${svc_account}"
  sudo useradd -g "${svc_account}" -m -d "/var/lib/${svc_account}" -s /sbin/nologin "${svc_account}"
done
2 下载安装包并解压
下载
# Release archives of every component; each one is downloaded into the
# current directory, in the order listed.
release_urls=(
  # prometheus
  https://github.com/prometheus/prometheus/releases/download/v2.25.2/prometheus-2.25.2.linux-amd64.tar.gz
  # node_exporter
  https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz
  # consul
  https://releases.hashicorp.com/consul/1.10.1/consul_1.10.1_linux_amd64.zip
  # alertmanager
  https://github.com/prometheus/alertmanager/releases/download/v0.22.2/alertmanager-0.22.2.linux-amd64.tar.gz
  # grafana
  https://dl.grafana.com/oss/release/grafana-8.0.6.linux-amd64.tar.gz
  # pushgateway
  https://github.com/prometheus/pushgateway/releases/download/v1.4.1/pushgateway-1.4.1.linux-amd64.tar.gz
)
for release_url in "${release_urls[@]}"; do
  wget "${release_url}"
done
解压
# Unpack every downloaded archive and hand the result to the matching service
# account. tar -C needs an existing target directory, so create it first.
mkdir -p /opt/module
# prometheus (archive name must match the v2.25.2 release that was downloaded)
tar xf prometheus-2.25.2.linux-amd64.tar.gz -C /opt/module
chown -R prometheus:prometheus /opt/module/prometheus-2.25.2.linux-amd64
# node_exporter
tar xf node_exporter-1.1.2.linux-amd64.tar.gz -C /opt/module
chown -R prometheus:prometheus /opt/module/node_exporter-1.1.2.linux-amd64
# consul (archive name must match the 1.10.1 release that was downloaded)
unzip consul_1.10.1_linux_amd64.zip -d /usr/local/bin
chown -R consul:consul /usr/local/bin/consul
# alertmanager
tar xf alertmanager-0.22.2.linux-amd64.tar.gz -C /opt/module/
chown -R alertmanager:alertmanager /opt/module/alertmanager-0.22.2.linux-amd64
# grafana: chown the directory the rest of this guide uses,
# /opt/module/grafana-8.0.6 (no ".linux-amd64" suffix).
tar xf grafana-8.0.6.linux-amd64.tar.gz -C /opt/module/
chown -R grafana:grafana /opt/module/grafana-8.0.6
# pushgateway
tar xf pushgateway-1.4.1.linux-amd64.tar.gz -C /opt/module/
chown -R prometheus:prometheus /opt/module/pushgateway-1.4.1.linux-amd64
3 修改配置文件
3.1 prometheus
cd /opt/module/prometheus-2.25.2.linux-amd64
# Prometheus uses Consul for service discovery: the targets are registered in
# Consul and fetched from there through consul_sd_configs, so no static target
# list is kept in this file.
mkdir -p /opt/module/prometheus-2.25.2.linux-amd64/yaml
# The delimiter is quoted so the YAML is written verbatim. The indentation is
# part of the YAML structure and must be kept exactly as shown.
tee /opt/module/prometheus-2.25.2.linux-amd64/yaml/prometheus.yml <<'EOF'
#全局参数
global:
  scrape_interval: 5s
  evaluation_interval: 5s
#告警服务
alerting:
  alertmanagers:
    - consul_sd_configs:
        - server: "10.0.8.1:8500"
          tags:
            - "alertmanager"
#告警内容
rule_files:
  - "/opt/module/prometheus-2.25.2.linux-amd64/rule_yaml/*_rule.yml"
#监控任务
scrape_configs:
  - job_name: 'prometheus'
    consul_sd_configs:
      - server: "10.0.8.1:8500"
        tags:
          - "prometheus"
        refresh_interval: 10m
  - job_name: 'node_exporter'
    consul_sd_configs:
      - server: "10.0.8.1:8500"
        tags:
          - "node_exporter"
        refresh_interval: 10m
  - job_name: 'alertmanager'
    consul_sd_configs:
      - server: "10.0.8.1:8500"
        tags:
          - "alertmanager"
        refresh_interval: 10m
  - job_name: 'pushgateway'
    consul_sd_configs:
      - server: "10.0.8.1:8500"
        tags:
          - "pushgateway"
        refresh_interval: 10m
EOF
# Alerting rules loaded by Prometheus through the rule_files glob
# /opt/module/prometheus-2.25.2.linux-amd64/rule_yaml/*_rule.yml.
# The heredoc delimiters are quoted, so the {{ $labels... }} / {{ $value }}
# template placeholders are written literally without backslash escapes.
mkdir -p /opt/module/prometheus-2.25.2.linux-amd64/rule_yaml
# Fires when a scraped target has reported up == 0 for 10 seconds.
tee /opt/module/prometheus-2.25.2.linux-amd64/rule_yaml/up_rule.yml <<'EOF'
groups:
  - name: node_exporter
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 10s
        annotations:
          title: 'Instance down'
          description: 'Instance {{ $labels.instance }} has been down for more than 10 seconds!'
          summary: 'View links: http://监控链接xxxx'
        labels:
          severity: 'critical'
EOF
# Fires when the non-idle CPU share of an instance stays above 40% for 30s.
tee /opt/module/prometheus-2.25.2.linux-amd64/rule_yaml/cpu_rule.yml <<'EOF'
groups:
  - name: node_exporter
    rules:
      - alert: InstanceCpuMoreThan40
        expr: (1 - sum(increase(node_cpu_seconds_total{mode="idle"}[10s])) by (instance)/sum(increase(node_cpu_seconds_total[10s])) by (instance)) * 100 > 40
        for: 30s
        annotations:
          title: 'Instance Cpu Usage'
          description: 'The Cpu Usage of Instance: {{ $labels.instance }} More Than 40%, Now it has reached: {{ $value }} !'
          summary: 'View links: http://监控链接xxxx'
        labels:
          severity: 'critical'
EOF
# Fires when used memory (total minus free, buffers and cache) stays above 40%
# for 30s. The annotation text states the same 40% threshold as the expression,
# and the summary string is closed with the quote character that opened it.
tee /opt/module/prometheus-2.25.2.linux-amd64/rule_yaml/mem_rule.yml <<'EOF'
groups:
  - name: node_exporter
    rules:
      - alert: InstanceMemMoreThan40
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 40
        for: 30s
        annotations:
          title: "Instance Mem Usage"
          description: "The Mem Usage of Instance: {{ $labels.instance }} More Than 40%, Now it has reached: {{ $value }} !"
          summary: "View links: http://监控链接xxxx"
        labels:
          severity: 'critical'
EOF
# systemd unit for Prometheus. It runs as the prometheus account, reads the
# configuration from yaml/prometheus.yml and stores the TSDB under data/ inside
# the installation directory.
# Retention is set with --storage.tsdb.retention.time; the older
# --storage.tsdb.retention spelling is deprecated in Prometheus 2.x.
tee /usr/lib/systemd/system/prometheus.service <<'EOF'
[Unit]
Description=prometheus
After=network.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/opt/module/prometheus-2.25.2.linux-amd64/prometheus --config.file=/opt/module/prometheus-2.25.2.linux-amd64/yaml/prometheus.yml --storage.tsdb.path=/opt/module/prometheus-2.25.2.linux-amd64/data --storage.tsdb.retention.time=30d --web.enable-lifecycle --log.level=debug
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
3.2 node_exporter
master01和node01上均需安装node_exporter;注意node01上同样要先创建prometheus用户(useradd prometheus),并将node_exporter安装包解压到/opt/module,再创建下面的服务文件并启动node_exporter服务
# systemd unit that runs node_exporter under the prometheus account and
# restarts it when it exits with a failure.
node_exporter_unit=/usr/lib/systemd/system/node_exporter.service
tee "${node_exporter_unit}" <<'EOF'
[Unit]
Description=node_exporter
Documentation=https://github.com/prometheus/node_exporter
After=network.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/opt/module/node_exporter-1.1.2.linux-amd64/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
3.3 consul
# Data and configuration directories for the Consul agent, owned by the
# consul account. mkdir -p keeps the step repeatable if a directory exists.
mkdir -p /consul/data /etc/consul
chown -R consul:consul /consul/data
chown -R consul:consul /etc/consul
# systemd unit for the Consul agent. The agent loads its definitions from
# /etc/consul (-config-dir), serves the web UI (-ui) and accepts client
# connections on every interface (-client=0.0.0.0).
# NOTE(review): -dev selects Consul's development mode; confirm this is
# acceptable for the target environment.
tee /usr/lib/systemd/system/consul.service <<'EOF'
[Unit]
Description=consul
Documentation=https://www.consul.io/downloads
After=network.target
[Service]
Type=simple
User=consul
Group=consul
ExecStart=/usr/local/bin/consul agent -dev -ui -data-dir=/consul/data -config-dir=/etc/consul -client=0.0.0.0
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# Service-discovery definition matching the "node_exporter" tag used by
# consul_sd_configs in prometheus.yml. It is written into /etc/consul, the
# directory the agent is started with as -config-dir; a file left in the
# current working directory would never be loaded by the agent.
tee /etc/consul/node_exporter.json <<'EOF'
{
  "services": [
    {
      "id": "node_exporter-master01",
      "name": "node_exporter-master01",
      "address": "10.0.8.1",
      "port": 9100,
      "tags": ["node_exporter"],
      "checks": [{
        "http": "http://10.0.8.1:9100/metrics",
        "interval": "5s"
      }]
    },
    {
      "id": "node_exporter-node01",
      "name": "node_exporter-node01",
      "address": "10.0.8.2",
      "port": 9100,
      "tags": ["node_exporter"],
      "checks": [{
        "http": "http://10.0.8.2:9100/metrics",
        "interval": "5s"
      }]
    }
  ]
}
EOF
# Consul definition of the Alertmanager instance (tag "alertmanager"). It is
# written into /etc/consul, the agent's -config-dir, so the agent loads it.
tee /etc/consul/alertmanager.json <<'EOF'
{
  "services": [
    {
      "id": "alertmanager-server01",
      "name": "alertmanager-server01",
      "address": "10.0.8.1",
      "port": 9093,
      "tags": ["alertmanager"],
      "checks": [{
        "http": "http://10.0.8.1:9093/#/alerts",
        "interval": "5s"
      }]
    }
  ]
}
EOF
# Consul definition of the Prometheus server itself (tag "prometheus"). It is
# written into /etc/consul, the agent's -config-dir, so the agent loads it.
tee /etc/consul/prometheus-servers.json <<'EOF'
{
  "services": [
    {
      "id": "prometheus-master01",
      "name": "prometheus-master01",
      "address": "10.0.8.1",
      "port": 9090,
      "tags": ["prometheus"],
      "checks": [{
        "http": "http://10.0.8.1:9090/metrics",
        "interval": "5s"
      }]
    }
  ]
}
EOF
# Consul definition of the Pushgateway instance (tag "pushgateway"). It is
# written into /etc/consul, the agent's -config-dir, so the agent loads it.
tee /etc/consul/pushgateway.json <<'EOF'
{
  "services": [
    {
      "id": "pushgateway-master01",
      "name": "pushgateway-master01",
      "address": "10.0.8.1",
      "port": 9091,
      "tags": ["pushgateway"],
      "checks": [{
        "http": "http://10.0.8.1:9091/metrics",
        "interval": "5s"
      }]
    }
  ]
}
EOF
3.4 alertmanager
# Alertmanager configuration: a single route that delivers every alert to the
# "mail-me" e-mail receiver, plus two inhibit rules keyed on the instance
# label. The indentation is part of the YAML structure and must be kept
# exactly as shown. Replace the placeholder address and password before use.
tee /opt/module/alertmanager-0.22.2.linux-amd64/alertmanager.yml <<'EOF'
global:
  resolve_timeout: 5m
route:
  group_by: ['node_exporter']
  group_wait: 1m
  group_interval: 1m
  repeat_interval: 5m
  receiver: 'mail-me'
receivers:
  - name: 'mail-me'
    email_configs:
      - to: '123456@163.com'
        from: '123456@163.com'
        smarthost: 'smtp.163.com:25'
        auth_username: '123456@163.com'
        auth_identity: '123456@163.com'
        auth_password: '******'
        require_tls: false
inhibit_rules:
  - source_match:
      alertname: InstanceDown
      severity: critical
    target_match:
      alertname: InstanceDown
      severity: critical
    equal:
      - instance
  - source_match:
      alertname: InstanceCpuMoreThan40
      severity: critical
    target_match:
      alertname: InstanceCpuMoreThan40
      severity: critical
    equal:
      - instance
EOF
# systemd unit for Alertmanager, run as the alertmanager account.
# --storage.path is set explicitly: the default is the relative directory
# "data/", which under systemd resolves against "/" (no WorkingDirectory= is
# set) where the unprivileged account cannot create it. The installation
# directory is owned by alertmanager, so the data directory can be created.
tee /usr/lib/systemd/system/alertmanager.service <<'EOF'
[Unit]
Description=alertmanager
Documentation=https://github.com/prometheus/alertmanager
After=network.target
[Service]
Type=simple
User=alertmanager
Group=alertmanager
ExecStart=/opt/module/alertmanager-0.22.2.linux-amd64/alertmanager --config.file=/opt/module/alertmanager-0.22.2.linux-amd64/alertmanager.yml --storage.path=/opt/module/alertmanager-0.22.2.linux-amd64/data
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
3.5 grafana
# Environment file for Grafana: it defines the account, the installation,
# data, log, plugin and provisioning paths and the PID-file directory.
# Afterwards the file is handed to the grafana account.
grafana_env_file=/etc/sysconfig/grafana-server
tee "${grafana_env_file}" <<'EOF'
GRAFANA_USER=grafana
GRAFANA_GROUP=grafana
GRAFANA_HOME=/opt/module/grafana-8.0.6
LOG_DIR=/opt/module/grafana-8.0.6/data/log
DATA_DIR=/opt/module/grafana-8.0.6/data
MAX_OPEN_FILES=10000
CONF_DIR=/opt/module/grafana-8.0.6/conf
CONF_FILE=/opt/module/grafana-8.0.6/conf/defaults.ini
RESTART_ON_UPGRADE=true
PLUGINS_DIR=/opt/module/grafana-8.0.6/data/plugins
PROVISIONING_CFG_DIR=/etc/grafana/provisioning
# Only used on systemd systems
PID_FILE_DIR=/var/run/grafana
EOF
chown -R grafana:grafana "${grafana_env_file}"
# systemd unit for Grafana. The ${...} references are resolved by systemd from
# /etc/sysconfig/grafana-server (EnvironmentFile), not by the shell.
# The heredoc delimiter is quoted so the unit is written exactly as shown:
# with an unquoted delimiter the shell would drop every backslash-newline
# (collapsing ExecStart into one long line) and would need \$ escapes.
tee /usr/lib/systemd/system/grafana.service <<'EOF'
[Unit]
Description=Grafana instance
Documentation=http://docs.grafana.org
Wants=network-online.target
After=network-online.target
After=postgresql.service mariadb.service mysqld.service prometheus.service
[Service]
EnvironmentFile=/etc/sysconfig/grafana-server
User=grafana
Group=grafana
Type=notify
Restart=on-failure
WorkingDirectory=/opt/module/grafana-8.0.6
RuntimeDirectory=grafana
RuntimeDirectoryMode=0750
ExecStart=/opt/module/grafana-8.0.6/bin/grafana-server \
  --config=${CONF_FILE} \
  --pidfile=${PID_FILE_DIR}/grafana-server.pid \
  cfg:default.paths.logs=${LOG_DIR} \
  cfg:default.paths.data=${DATA_DIR} \
  cfg:default.paths.plugins=${PLUGINS_DIR} \
  cfg:default.paths.provisioning=${PROVISIONING_CFG_DIR}
LimitNOFILE=10000
TimeoutStopSec=20
[Install]
WantedBy=multi-user.target
EOF
3.6 pushgateway
# Lay out the Pushgateway installation: the binary goes to bin/, the
# persistence file lives in data/.
mkdir -p /opt/module/pushgateway-1.4.1.linux-amd64/{bin,data}
mv /opt/module/pushgateway-1.4.1.linux-amd64/pushgateway /opt/module/pushgateway-1.4.1.linux-amd64/bin
# bin/ and data/ were just created by the current (root) user; give them to
# the prometheus account, otherwise the service cannot write
# data/pushgateway.data.
chown -R prometheus:prometheus /opt/module/pushgateway-1.4.1.linux-amd64
# systemd unit for Pushgateway, run as the prometheus account, persisting
# pushed metrics to data/pushgateway.data.
tee /usr/lib/systemd/system/pushgateway.service <<'EOF'
[Unit]
Description=pushgateway
Documentation=https://github.com/prometheus/pushgateway
After=network.target
[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/opt/module/pushgateway-1.4.1.linux-amd64/bin/pushgateway --persistence.file=/opt/module/pushgateway-1.4.1.linux-amd64/data/pushgateway.data
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
4 启动与验证
# Make systemd re-read the unit files written above; without this, a unit
# that was edited after systemd first loaded it keeps its old definition.
systemctl daemon-reload
# Enable every service at boot, then start them in the same order.
monitoring_units=(
  prometheus.service
  node_exporter.service
  consul.service
  alertmanager.service
  grafana.service
  pushgateway.service
)
for unit in "${monitoring_units[@]}"; do
  systemctl enable "${unit}"
done
for unit in "${monitoring_units[@]}"; do
  systemctl start "${unit}"
done
# Verify that the processes are running and that their ports are listening
# (9090 prometheus, 9100 node_exporter, 8500 consul, 9093 alertmanager,
# 3000 grafana, 9091 pushgateway).
ps aux | grep -i -E "prometheus|node_exporter|consul|alertmanager|grafana|pushgateway"
ss -ntl | grep -i -E "9090|9100|8500|9093|3000|9091"
打开链接检查网页是否正常:
#prometheus
http://10.0.8.1:9090/targets
#grafana
http://10.0.8.1:3000/
#consul
http://10.0.8.1:8500/ui/
#alert
http://10.0.8.1:9093/#/alerts
至此,所有服务部署完毕,grafana配置详见上一节:Prometheus学习(一):单节点部署与配置+Grafana安装