最新nagios监控搭建,小白也能看懂的文档!

nagios监控搭建

系统版本 | 当前软件版本
Rocky Linux 9.1 | Nagios 4.4.14 / nrpe 4.1.0
IP地址 | 服务端/客户端
10.3.3.200 | 服务端
10.3.3.160 | 客户端
10.3.3.162 | 客户端

服务端搭建

yum install -y epel-release
rpm -ivh https://dl.fedoraproject.org/pub/epel/9/Everything/x86_64/Packages/e/epel-release-9-9.el9.noarch.rpm
yum install -y gcc glibc glibc-common gd gd-devel  openssl-devel perl postfix
yum install -y nagios  nagios-plugins-nt nagios-plugins-nrpe nagios-plugins-load nagios-plugins-disk nagios-plugins-swap nagios-plugins-procs nagios-plugins-users

systemctl start postfix
systemctl status postfix
systemctl enable postfix

# 创建nagiosadmin 密码
[root@k8s-master ~]# htpasswd -c /etc/nagios/passwd nagiosadmin
New password: devops@123
Re-type new password:  devops@123
Adding password for user nagiosadmin

修改配置

修改配置一:
cd  /etc/nagios
more cgi.cfg | grep -Ev '^$|^#'

main_config_file=/etc/nagios/nagios.cfg
physical_html_path=/usr/share/nagios/html
url_html_path=/nagios
show_context_help=0
use_pending_states=1
use_authentication=1
use_ssl_authentication=0
authorized_for_system_information=nagiosadmin
authorized_for_configuration_information=nagiosadmin
authorized_for_system_commands=nagiosadmin
authorized_for_all_services=nagiosadmin
authorized_for_all_hosts=nagiosadmin
authorized_for_all_service_commands=nagiosadmin
authorized_for_all_host_commands=nagiosadmin
default_statuswrl_layout=4
ping_syntax=/bin/ping -n -U -c 5 $HOSTADDRESS$
# 修改如下两项即可
refresh_rate=10
# 自动刷新:每 10 秒,系统会自动重新加载数据,显示最新的状态或结果
result_limit=200
# 限制了Nagios在展示状态信息、日志记录或其他相关查询结果时,最多可以显示200条记录

escape_html_tags=1
action_url_target=_blank
notes_url_target=_blank
lock_author_names=1
navbar_search_for_addresses=1
navbar_search_for_aliases=1
修改配置二:
more nagios.cfg | grep -Ev '^$|^#'
log_file=/var/log/nagios/nagios.log
cfg_file=/etc/nagios/objects/commands.cfg
cfg_file=/etc/nagios/objects/contacts.cfg
cfg_file=/etc/nagios/objects/timeperiods.cfg
cfg_file=/etc/nagios/objects/templates.cfg
# 新增如下4行,注释掉localhost.cfg 
# cfg_file=/etc/nagios/objects/localhost.cfg
cfg_file=/etc/nagios/objects/hosts.cfg
cfg_file=/etc/nagios/objects/hostsgroup.cfg
cfg_file=/etc/nagios/objects/service.cfg
cfg_file=/etc/nagios/objects/servicegroup.cfg

object_cache_file=/var/spool/nagios/objects.cache
precached_object_file=/var/spool/nagios/objects.precache
resource_file=/etc/nagios/private/resource.cfg
status_file=/var/spool/nagios/status.dat
status_update_interval=10
nagios_user=nagios
nagios_group=nagios
check_external_commands=1
command_file=/usr/local/nagios/var/rw/nagios.cmd
lock_file=/var/run/nagios/nagios.pid
temp_file=/usr/local/nagios/var/nagios.tmp
temp_path=/tmp
event_broker_options=-1
log_rotation_method=d
log_archive_path=/var/log/nagios/archives
use_syslog=1
log_notifications=1
log_service_retries=1
log_host_retries=1
log_event_handlers=1
log_initial_states=0
log_current_states=1
log_external_commands=1
log_passive_checks=1
service_inter_check_delay_method=s
max_service_check_spread=30
service_interleave_factor=s
host_inter_check_delay_method=s
max_host_check_spread=30
max_concurrent_checks=0
check_result_reaper_frequency=10
max_check_result_reaper_time=30
check_result_path=/var/spool/nagios/checkresults
max_check_result_file_age=3600
cached_host_check_horizon=15
cached_service_check_horizon=15
enable_predictive_host_dependency_checks=1
enable_predictive_service_dependency_checks=1
soft_state_dependencies=0
auto_reschedule_checks=0

auto_rescheduling_interval=10
# 自动重新调度检查的时间间隔,单位为秒(此处为每 10 秒尝试一次)。Nagios 会按此间隔尝试重新安排主动检查的执行时间,使检查在时间上分布得更均匀;该选项仅在 auto_reschedule_checks=1 时生效,而上面 auto_reschedule_checks=0,因此该设置当前不起作用

auto_rescheduling_window=180
service_check_timeout=60
host_check_timeout=30
event_handler_timeout=30
notification_timeout=30
ocsp_timeout=5
ochp_timeout=5
perfdata_timeout=5
retain_state_information=1
state_retention_file=/var/spool/nagios/retention.dat

retention_update_interval=10
# 每 10 分钟自动保存一次状态保留数据(该选项的单位是分钟,不是秒;设为 0 则只在 Nagios 关闭或重启前保存)。在大规模的监控环境中,需要根据系统的性能和需求来调整合适的时间间隔;如果不希望过于频繁地保存数据,可以选择较长的时间间隔,如 30 分钟、60 分钟等

use_retained_program_state=1
use_retained_scheduling_info=1
retained_host_attribute_mask=0
retained_service_attribute_mask=0
retained_process_host_attribute_mask=0
retained_process_service_attribute_mask=0
retained_contact_host_attribute_mask=0
retained_contact_service_attribute_mask=0

interval_length=30
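# 定义调度中一个"时间单位"的长度,单位为秒(默认 60)。check_interval、retry_interval、notification_interval 等参数都以该时间单位计数;此处设为 30,即这些参数中的 1 代表 30 秒,而不是 1 分钟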

check_for_updates=1
bare_update_check=0
use_aggressive_host_checking=0
execute_service_checks=1
accept_passive_service_checks=1
execute_host_checks=1
accept_passive_host_checks=1

enable_notifications=1
# 启用通知功能。当监控到的服务或主机状态发生变化(例如从 OK 到 CRITICAL,或者 WARNING 等状态),系统会自动发送通知给预定的接收者

enable_event_handlers=1
process_performance_data=0
obsess_over_services=0
obsess_over_hosts=0
translate_passive_host_checks=0
passive_host_checks_are_soft=0
check_for_orphaned_services=1
check_for_orphaned_hosts=1
check_service_freshness=1

service_freshness_check_interval=10
# 每 10 秒对服务检查结果进行一次新鲜度检查(该选项的单位是秒,不是分钟)

service_check_timeout_state=c

check_host_freshness=1
# 启用主机新鲜度检查

host_freshness_check_interval=10
# 指定了主机新鲜度检查的时间间隔,单位是秒(此处为每 10 秒检查一次)。新鲜度检查用于确认主机的检查结果没有过期:如果某主机的最新检查结果超过其新鲜度阈值(freshness_threshold)仍未更新,Nagios 会认为该结果已经过时,并强制对该主机执行一次主动检查。

additional_freshness_latency=15
enable_flap_detection=1
low_service_flap_threshold=5.0
high_service_flap_threshold=20.0
low_host_flap_threshold=5.0
high_host_flap_threshold=20.0
date_format=us
illegal_object_name_chars=`~!$%^&*|'"<>?,()=
illegal_macro_output_chars=`~$&|'"<>
use_regexp_matching=0
use_true_regexp_matching=0
admin_email=nagios@localhost
admin_pager=pagenagios@localhost
daemon_dumps_core=0
use_large_installation_tweaks=0
enable_environment_macros=0
debug_level=0
debug_verbosity=1
debug_file=/var/log/nagios/nagios.debug
max_debug_file_size=1000000
allow_empty_hostgroup_assignment=0

# 最好修改/新增如下配置(注意:上文已有 process_performance_data=0,应直接将其改为 1,不要重复添加该项)
process_performance_data=1
# 启用性能数据的处理功能

service_perfdata_file=/tmp/perfdata.log
# 指定了一个 性能数据文件 的路径,通常用于存储由 Nagios 生成的服务性能数据。

service_perfdata_file_template=$LASTSERVICECHECK$||$HOSTNAME$||$SERVICEDESC$||$SERVICEOUTPUT$||$SERVICEPERFDATA$
# 定义了性能数据文件中每条数据的格式。
# $LASTSERVICECHECK$:上次检查的时间戳。
# $HOSTNAME$:主机名。
# $SERVICEDESC$:服务的描述,例如 "HTTP" 或 "CPU"。
# $SERVICEOUTPUT$:服务的输出信息,通常是监控到的状态信息。
# $SERVICEPERFDATA$:服务的性能数据,通常是具体的性能指标(如 CPU 使用率、磁盘空间等)。
# 这些变量通过 || 分隔符连接,表示每条数据记录的不同字段。

service_perfdata_file_mode=a
# 设置了性能数据文件的写入模式:a 表示每次写入时,数据将被追加到文件的末尾;w 表示会覆盖原有文件,每次生成新的性能数据时,文件内容都会被清空

service_perfdata_file_processing_interval=30
# 性能数据文件每 30 秒会被处理一次

修改配置三:
cd objects
# commands.cfg 专门用于定义各种监控命令,这些命令随后可以被Nagios的其他配置文件引用和执行。
vim commands.cfg
# 找到 check_http 的命令定义,将其原来的配置删除,并在原位置添加如下内容:

define command {
    command_name    check_http
    command_line    $USER1$/check_http -I $HOSTADDRESS$ -p $ARG1$ $ARG2$
}

define command {
    command_name    check_nrpe
    command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
define command {
    command_name    check_nrpe_arg
    command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ -a $ARG2$
}

define command {
    command_name    check_local_mysql
    command_line    $USER1$/check_mysql -u $ARG1$ -p $ARG2$
}
define command {
    command_name    check_mysql
    command_line    $USER1$/check_mysql -H $HOSTADDRESS$ -u $ARG1$ -p $ARG2$
}

修改配置四:
vim templates.cfg
define host {
    name                            generic-printer      
    use                             generic-host         
    check_period                    24x7                 
    check_interval                  5                    
    retry_interval                  1                    
    max_check_attempts              10                   
    check_command                   check-host-alive     
    notification_period             workhours            
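    notification_interval           30      ; 每隔 30 个时间单位向指定的联系人或联系人组(在此配置中为 admins)重复发送一次通知;时间单位由 nagios.cfg 的 interval_length 决定(为 60 时即 30 分钟,为 30 时即 15 分钟)。注意:对象配置文件中的行内注释应使用 ; 而不是 #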
    notification_options            d,r                  
    contact_groups                  admins               
    statusmap_image		printer.png
    register                        0                    
}

define service {

    name                            generic-service         ; The 'name' of this service template
    active_checks_enabled           1                       ; Active service checks are enabled
    passive_checks_enabled          1                       ; Passive service checks are enabled/accepted
    parallelize_check               1                       ; Active service checks should be parallelized (disabling this can lead to major performance problems)
    obsess_over_service             1                       ; We should obsess over this service (if necessary)
    check_freshness                 0                       ; Default is to NOT check service 'freshness'
    notifications_enabled           1                       ; Service notifications are enabled
    event_handler_enabled           1                       ; Service event handler is enabled
    flap_detection_enabled          1                       ; Flap detection is enabled
    process_perf_data               1                       ; Process performance data
    retain_status_information       1                       ; Retain status information across program restarts
    retain_nonstatus_information    1                       ; Retain non-status information across program restarts
    is_volatile                     0                       ; The service is not volatile
    check_period                    24x7                    ; The service can be checked at any time of the day
    max_check_attempts              3                       ; Re-check the service up to 3 times in order to determine its final (hard) state
    ;normal_check_interval           1						; 系统每隔多长时间检查一次某个服务或主机的状态,下方这个参数也是一样
    check_interval                  1                      ; 系统每隔多长时间检查一次某个服务或主机的状态
    retry_interval                  2                       ; Re-check the service every two minutes until a hard state can be determined
    contact_groups                  admins                  ; Notifications get sent out to everyone in the 'admins' group
    notification_options            w,u,c,r                 ; Send notifications about warning, unknown, critical, and recovery events
    notification_interval           60                      ; Re-notify about service problems every hour
    notification_period             24x7                    ; Notifications can be sent out at any time
    register                        0                       ; DON'T REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
}

define service {

    name                            local-service           ; The name of this service template
    use                             generic-service         ; Inherit default values from the generic-service definition
    max_check_attempts              4                       ; Re-check the service up to 4 times in order to determine its final (hard) state
    ;normal_check_interval           1                       ; 系统每隔多长时间检查一次某个服务或主机的状态,下方这个参数也是一样
    check_interval                  1                       ; 系统每隔多长时间检查一次某个服务或主机的状态
    retry_interval                  1                       ; Re-check the service every minute until a hard state can be determined
    register                        0                       ; DON'T REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
}


define service {

    name                           es-service               ;Elastic search service
    use                            generic-service          ;Inherit
    contact_groups                 es-admin                 ;es admin
    register                        0                       ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE!
}


# define host 代表的是主机模版
# define service 代表的是服务监控模版,这里共有三个:generic-service 是通用模版;local-service 基于 generic-service 派生,用于监控 nagios 本机;es-service 同样基于 generic-service 派生,其通知发送给 es-admin 联系人组
修改配置五:
# 先测试一下看看邮件是否可以发送成功
echo "how are you today" | mail -s "test" 1893130****@163.com

# 修改配置
more contacts.cfg  | grep -Ev '^$|^#'
# 发送邮件配置
define contact {
    contact_name            yuhailong             ; Short name of user
    use                     generic-contact         ; Inherit default values from generic-contact template (defined above)
    alias                   Yu Hai Long          ; Full name of user
    email                   1893130****@163.com ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ******
  ;  pager               # 如果有短信告警,这里可以选填手机号
}
; 多个以此类推
;例如如果有第二个人
define contact {
    contact_name            zhangsan            ; Short name of user
    use                     generic-contact     ; Inherit default values from generic-contact template (defined above)
    alias                   Zhang San           ; Full name of user
    email                   zhangsan@163.com    ; <<***** CHANGE THIS TO YOUR EMAIL ADDRESS ******
  ;  pager               # 如果有短信告警,这里可以选填手机号
}

define contactgroup {
    contactgroup_name       admins
    alias                   Nagios Administrators
    members                 yuhailong,zhangsan    ; 多个以逗号分隔,
}

define contactgroup {

    contactgroup_name       es-admin
    alias                   Elastic Search Administrators
    members                 yuhailong
}

###启动服务

# 检测nagios配置文件是否正确
nagios -v /etc/nagios/nagios.cfg
# 保证如下两项为0即可
Total Warnings: 0
Total Errors:   0

systemctl restart nagios
systemctl enable nagios

访问:http://10.3.3.200:8080/nagios/

配置监控项

配置需要监控的主机
vim /etc/nagios/objects/hosts.cfg
# 配置的是需要监控的主机。
# use 使用linux-server 模版,模版在templates.cfg定义的 linux-server
# host_name 定义主机名称,自定义
# alias 定义别名,自定义
# address 监控主机的IP地址(请填写被监控主机的实际IP;本文环境说明中主机位于 10.3.3.x 网段,而下面示例使用的是 192.168.3.x,请按实际环境替换)

define host {
    use                     linux-server   ; Name of host template to use 
    host_name               160
    alias                   160
    address                 192.168.3.160
}

define host {
    use                     linux-server   ; Name of host template to use
    host_name               200
    alias                   200
    address                 192.168.3.200
}

define host {
    use                     linux-server   ; Name of host template to use
    host_name               162
    alias                   162
    address                 192.168.3.162
}

配置需要监控的主机组

vim /etc/nagios/objects/hostsgroup.cfg
define hostgroup {
    hostgroup_name          nrpe-servers          ; The name of the hostgroup
    alias                   Servers with Nrpe     ; Long name of the group
    members                 *                     ; Comma separated list of hosts that belong to this group
}

# 定义组
# hostgroup_name 组名称
# alias 别名
# members  组成员  

# 注释
members   *  
# * 代表所有主机
members   *,!160 # 表示组成员为所有主机,但是不包括160(排除项要用逗号与 * 分隔)
# 在主机组配置中,! 用作排除操作符,用于从集合中排除特定元素。
# 在监控命令(check_command)中,! 用作分隔符,用于分隔命令名和命令的各个参数($ARG1$、$ARG2$ ……)。
members   160
# 表示当前组里面只有160一个主机,
members   160,162
# 表示当前组里面有160和162两个主机,多个用逗号分隔
配置需要监控的主机的服务
vim /etc/nagios/objects/service.cfg 
define service {
    use                     generic-service           ; Name of service template to use
   ; host                   160
    hostgroup_name          nrpe-servers
    service_description     PING
    check_command           check_ping!100.0,20%!500.0,60%
    notifications_enabled   1
    servicegroups           ping-service      
}

define service {
    use                     generic-service           ; Name of service template to use
    hostgroup_name          nrpe-servers
    service_description     Root Partition
    check_command           check_nrpe_arg!check_disk!/
    notifications_enabled   1
    servicegroups           partition-free-space-services
}

define service {
    use                     generic-service           ; Name of service template to use
    hostgroup_name          nrpe-servers
    service_description     Var Partition
    check_command           check_nrpe_arg!check_disk!/var
    notifications_enabled   1
    servicegroups           partition-free-space-services
}

define service {
    use                     generic-service           ; Name of service template to use
    hostgroup_name          nrpe-servers
    service_description     data Partition
    check_command           check_nrpe_arg!check_disk!/data
    notifications_enabled   1
    servicegroups           partition-free-space-services
}

define service {
    use                     generic-service           ; Name of service template to use
    hostgroup_name          nrpe-servers
    service_description     Current Load
    check_command           check_nrpe!check_load
    servicegroups           checkload-service
    notifications_enabled   0
}


define service {
    use                     generic-service           ; Name of service template to use
    host_name               200
    service_description     jumpserver 8089
    check_command           check_http!8089! -u /
    notifications_enabled   1
    servicegroups           http-services
}


define service {
    use                     generic-service           ; Name of service template to use
    host_name               200
    service_description     zabbix 8080
    check_command           check_http!8080! -u /zabbix ;如果是外网域名的话 使用 -H 指定  如下 域名/IP 都可以
    notifications_enabled   1
    servicegroups           http-services
}

;define service {
;    use                     generic-service           ; Name of service template to use
;    ;hostgroup_name          http-www
;    host_name               200
;    service_description     Http www.okcis.cn
;    check_command           check_http!9000!-H www.okcis.cn -u /php/echo.php   ;-H 118.144.81.101 
;    notifications_enabled   1
;    servicegroups           http-services      
;}

define service {
    use                     generic-service           ; Name of service template to use
    host                    200
    service_description     Mysql
    check_command           check_mysql!root!devops@123
    notifications_enabled   1
    servicegroups           mysql-service      
}

define service {
    use                     generic-service           ; Name of service template to use
    host                    160
    service_description     Mysql
    check_command           check_mysql!root!dtzxroot@2002.DHW
    notifications_enabled   1
    servicegroups           mysql-service      
}

define service {
    use                     generic-service           ; Name of service template to use
    host_name               160
    service_description     Redis
    check_command           check_tcp!6379
    notifications_enabled   1
    servicegroups           redis-service
}

;-----------------------------------------------Local Service ---------------------------------------------------------
; 本机监控

define service {
    use                     local-service           ; Name of service template to use
    host_name               200
    service_description     Current Load
    check_command           check_local_load!50.0,40.0,30.0!80.0,60.0,50.0
    servicegroups           checkload-service
}

define service {
    use                     local-service           ; Name of service template to use
    host_name               200
    service_description     Data Partition
    check_command           check_local_disk!1%!1%!/data
    notifications_enabled   1
    servicegroups           partition-free-space-services
}


define service {
    use                     local-service           ; Name of service template to use
    host_name               200
    service_description     Var Partition
    check_command           check_local_disk!20%!10%!/var
    notifications_enabled   1
    servicegroups           partition-free-space-services
}

define service {
    use                     local-service           ; Name of service template to use
    host_name               200
    service_description     Root Partition
    check_command           check_local_disk!20%!10%!/
    notifications_enabled   1
    servicegroups           partition-free-space-services
}

# use local-service : 表示本机nagios使用,其余的都使用 generic-service

# 分号代表注释,host可以写单个主机或多个主机,多个主机以逗号分隔,
# hostgroup_name : 代表属于这个主机组的服务器都适用这个监控项,hostgroup_name 是在 hostsgroup.cfg 中定义的

# service_description : nagios页面展示的名称,对应上要监控的服务名称

# check_command : 监控命令。check_local_disk 是检查本地磁盘空间的命令,! 后面依次是传递给它的参数;例如 check_local_disk!20%!10%!/ 中,20% 是警告阈值,10% 是严重阈值,/ 表示根分区
# check_command :这条命令的意思是监控根分区的剩余空间:剩余空间低于 20% 时发出警告(WARNING),低于 10% 时发出严重告警(CRITICAL)

# notifications_enabled 发生问题是否告警,1是true 0是false

# servicegroups 指定该服务所属的服务组(service group),方便在界面中按组查询。
配置需要监控的主机的服务组
more /etc/nagios/objects/servicegroup.cfg 
define servicegroup {
    servicegroup_name       ping-service
    alias                   Ping Service
}

define servicegroup {
    servicegroup_name       http-services
    alias                   http Service
}

define servicegroup {
    servicegroup_name       partition-free-space-services
    alias                   Partition Free Space Services
}

define servicegroup {
    servicegroup_name       checkload-service
    alias                   Current Load Service
}

define servicegroup {
    servicegroup_name       mysql-service
    alias                   mysql Service
}

define servicegroup {
    servicegroup_name       redis-service
    alias                   redis Service
}

客户端搭建

两台客户端 :10.3.3.160  10.3.3.162

yum install -y epel-release
yum -y install nrpe

[root@localhost ~]# more /etc/nagios/nrpe.cfg | grep -Ev '^$|^#'
log_facility=daemon
debug=0
pid_file=/run/nrpe/nrpe.pid
server_port=5666
nrpe_user=nrpe
nrpe_group=nrpe
# 定义了哪些主机可以通过 NRPE 访问该服务器。
allowed_hosts=127.0.0.1,::1,192.168.3.0/24  
# 设为 1 时,NRPE 允许远程主机在调用命令时传递参数(即下面命令定义中的 $ARG1$);这存在安全隐患,应配合 allowed_hosts 限制来源。服务端使用 check_nrpe -a 传参时必须开启此项
dont_blame_nrpe=1  
allow_bash_command_substitution=0
command_timeout=60
connection_timeout=300
disable_syslog=0
command[check_users]=/usr/lib64/nagios/plugins/check_users -w 5 -c 10
command[check_load]=/usr/lib64/nagios/plugins/check_load -w 50,800,1000 -c 200,1000,1000
command[check_disk]=/usr/lib64/nagios/plugins/check_disk -w 20% -c 10% -p $ARG1$
command[check_swap]=/usr/lib64/nagios/plugins/check_swap -w 20% -c 10%
command[check_total_procs]=/usr/lib64/nagios/plugins/check_procs -w 150 -c 200
include_dir=/etc/nrpe.d/

systemctl  start  nrpe 
systemctl  enable  nrpe 
systemctl  status nrpe
ss -ntulp | grep 5666

yum install -y nagios-plugins-load nagios-plugins-disk nagios-plugins-swap nagios-plugins-procs nagios-plugins-users nagios-plugins-mysql   
ls /usr/lib64/nagios/plugins/
systemctl restart nrpe
systemctl enable nrpe

重启服务

# 检测nagios配置文件是否正确
nagios -v /etc/nagios/nagios.cfg
# 保证如下两项为0即可
Total Warnings: 0
Total Errors:   0

systemctl restart nagios

访问:http://10.3.3.200:8080/nagios/

请添加图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值