接着昨天的nagios实验继续。
一、配置监控oracle服务器
1.安装nrpe(略)
下载DBI-1.617.tar.gz,DBD-Oracle-1.38.tar.gz,check_oracle_health-1.6.3.tar.gz。注意:下面前两个链接指向的版本(DBI-1.609、DBD-Oracle-1.52)与本文实际安装的版本不同,请按需选择版本,并相应调整后文命令中的文件名和目录名。
http://search.cpan.org/CPAN/authors/id/T/TI/TIMB/DBI-1.609.tar.gz
http://mirrors.neusoft.edu.cn/cpan/authors/id/P/PY/PYTHIAN/DBD-Oracle-1.52.tar.gz
http://labs.consol.de/wp-content/uploads/2009/09/check_oracle_health-1.6.3.tar.gz
2.安装DBI
# tar -xvf /soft/DBI-1.617.tar.gz -C /usr/src/
# cd /usr/src/DBI-1.617/
# perl Makefile.PL
# make all && make install
3.安装DBD
# tar -xvf /soft/DBD-Oracle-1.38.tar.gz -C /usr/src
# cd /usr/src/DBD-Oracle-1.38/
要在shell临时设置一下ORACLE_HOME,否则编译通不过
# export ORACLE_HOME=/opt/app/oracle/product/11.2.0/db_1
# perl Makefile.PL
# make && make install
4.安装check_oracle_health命令
# tar -xvf /soft/check_oracle_health-1.6.3.tar.gz -C /usr/src
# cd /usr/src/check_oracle_health-1.6.3/
# ./configure --prefix=/usr/local/nagios --with-nagios-user=nagios --with-nagios-group=nagios --with-mymodules-dir=/usr/local/nagios/libexec --with-mymodules-dyndir=/usr/local/nagios/libexec
# make all && make install
安装完后可在libexec目录下看到这个命令了
# ll /usr/local/nagios/libexec/
total 276
-rwxrwxr-x 1 nagios nagios 75340 Aug 4 21:02 check_nrpe
-rwxr-xr-x 1 root root 184721 Aug 4 23:26 check_oracle_health
在本机测试命令是否可用
# /usr/local/nagios/libexec/check_oracle_health --connect=dbtest --user=calvin --password=calvin --mode=tnsping
CRITICAL - cannot connect to dbtest. install_driver(Oracle) failed: Can't load '/usr/local/lib64/perl5/auto/DBD/Oracle/Oracle.so' for module DBD::Oracle: libclntsh.so.11.1: cannot open shared object file: No such file or directory at /usr/lib64/perl5/DynaLoader.pm line 200.
at (eval 18) line 3
Compilation failed in require at (eval 18) line 3.
Perhaps a required shared library or dll isn't installed where expected
at /usr/local/nagios/libexec/check_oracle_health line 4098
看到如上报错,执行如下命令:
# echo $ORACLE_HOME/lib >> /etc/ld.so.conf
# ldconfig
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefdms.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmevc.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefos.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmevq.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmadm.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmadbg.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefw.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmeoci.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmasf.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmastk.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmalk.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefud.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmarl.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefsqlt.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefvr.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefut.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefpfa.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmevsp.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefport.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmemso.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefsql.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmcfsga.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefojmx.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmefsp.so.0 is empty, not checked.
ldconfig: File /opt/app/oracle/product/11.2.0/db_1/lib/libnmcfhc.so.0 is empty, not checked.
ldconfig: /opt/app/oracle/product/11.2.0/db_1/lib/libexpat.so.1 is not a symbolic link
ldconfig: File /usr/lib64/libbfd.so is empty, not checked.
ldconfig: File /usr/lib64/libopcodes.so is empty, not checked.
再次执行命令看是否成功,这里的--connect指的是本地网络服务名(即tnsnames.ora中配置的别名)
# /usr/local/nagios/libexec/check_oracle_health --connect=dbtest --user=calvin --password=calvin --mode=tnsping
OK - connection established to dbtest.
# /usr/local/nagios/libexec/check_oracle_health --connect=dbtest --user=calvin --password=calvin --mode=tablespace-usage
OK - tbs USERS usage is 0.01%, tbs UNDOTBS1 usage is 0.04%, tbs TEMP usage is 0.00%, tbs SYSTEM usage is 2.15%, tbs SYSAUX usage is 1.61% | 'tbs_users_usage_pct'=0.01%;90;98 'tbs_users_usage'=4MB;29491;32112;0;32767 'tbs_users_alloc'=5MB;;;0;32767 'tbs_undotbs1_usage_pct'=0.04%;90;98 'tbs_undotbs1_usage'=12MB;29491;32112;0;32767 'tbs_undotbs1_alloc'=730MB;;;0;32767 'tbs_temp_usage_pct'=0.00%;90;98 'tbs_temp_usage'=0MB;29491;32112;0;32767 'tbs_temp_alloc'=60MB;;;0;32767 'tbs_system_usage_pct'=2.15%;90;98 'tbs_system_usage'=703MB;29491;32112;0;32767 'tbs_system_alloc'=710MB;;;0;32767 'tbs_sysaux_usage_pct'=1.61%;90;98 'tbs_sysaux_usage'=526MB;29491;32112;0;32767 'tbs_sysaux_alloc'=600MB;;;0;32767
5.在root和nagios用户的.bash_profile中添加如下内容
# Oracle environment for the accounts that run check_oracle_health.
ORACLE_SID=dbstat
ORACLE_BASE=/opt/app/oracle
ORACLE_HOME=$ORACLE_BASE/product/11.2.0/db_1
export ORACLE_SID ORACLE_BASE ORACLE_HOME
# $ORACLE_HOME is already an absolute path, so it is appended without an
# extra leading "/" (which would produce a "//opt/..." PATH entry).
PATH="$PATH:$ORACLE_HOME/bin:$HOME/bin"
export PATH
6.把nagios加到oinstall组
# usermod -a -G oinstall nagios
# id nagios
uid=502(nagios) gid=503(nagios) groups=503(nagios),501(oinstall)
7.设置nrpe.cfg文件
# vi /usr/local/nagios/etc/nrpe.cfg
#### Add the IP of the monitoring server
allowed_hosts=127.0.0.1,192.168.217.131
#### Commands to be monitored
# Each command[<name>] entry maps a name to the command line run on this host;
# the monitoring side refers to that name (e.g. check_nrpe!db-tnsping).
# NOTE(review): the database user and password appear in plain text in every
# command line below - confirm the file permissions of nrpe.cfg are restrictive.
##oracledb check
command[db-current-sessions]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode sql --name 'select count(1) from v$session' --name2 current_sessions --warning 800 --critical 1000
command[db-tnsping]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=tnsping
command[db-connection-time]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=connection-time
command[db-connected-users]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=connected-users --warning 900 --critical 1000
command[db-sga-data-buffer-hit-ratio]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=sga-data-buffer-hit-ratio
command[db-sga-shared-pool-free]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=sga-shared-pool-free
command[db-pga-in-memory-sort-ratio]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=pga-in-memory-sort-ratio
command[db-tablespace-wz]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --tablespace=wz --mode=tablespace-usage
command[db-tablespace-free]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=tablespace-free
command[db-datafile-io-traffic]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=datafile-io-traffic
command[db-switch-interval]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=switch-interval
command[db-roll-avgactivesize]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=roll-avgactivesize
# The only entry that does not call check_oracle_health: it runs a separate
# check_oracle_alertlog script from the same libexec directory.
command[db-alertlog-abnormal]=/usr/local/nagios/libexec/check_oracle_alertlog
command[db-top10-logical-reads-obj]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=seg-top10-logical-reads --warning 100 --critical 200
command[db-top10-physical-reads-obj]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=seg-top10-physical-reads --warning 100 --critical 200
command[db-event-waiting]=/usr/local/nagios/libexec/check_oracle_health --connect=tnsdbstat --user=calvin --password=calvin --mode=event-waiting --warning 7000 --critical 8000
这里有个命令是自定义的:check_oracle_alertlog,摘抄自别人写的perl脚本(依赖File::Tail和Date::Manip模块,需事先安装;脚本中的$oracle_log要改成本机alert日志的实际路径):
#!/usr/bin/perl -w
# check_oracle_alertlog - Nagios/NRPE plugin.
#
# Reads the tail of the Oracle alert log and reports every ORA-xxxxx error
# that appears after the first timestamp line newer than one hour ago.
#
# Output / exit status:
#   0  "Oracle alertlog is ok"            - no recent ORA- errors found
#   2  one "ORA-xxxxx: message" per line  - recent errors found
#   3  "UNKNOWN - ..."                    - the alert log could not be opened
use strict;
use File::Tail;
use Data::Dumper;
use Date::Manip;

# Adjust to the alert log of the monitored instance.
my $oracle_log = "/bea/oracle/admin/oradb10/bdump/alert_oradb10.log";
my $limit      = 1000;    # passed to File::Tail as its "tail" option

# Entries stamped later than $cutoff count as recent. The cutoff is computed
# with real date arithmetic: subtracting 10000 from the YYYYMMDDHHMMSS digit
# string yields an invalid time during the first hour after midnight.
my $now    = ParseDate( scalar localtime );
my $cutoff = DateCalc( $now, "- 1 hour" );

# Whether the constructor returns undef or dies, report the failure on stdout
# with exit code 3 so that Nagios shows it as UNKNOWN.
my $fh = eval { File::Tail->new( name => $oracle_log, tail => $limit ) };
if ( !defined $fh ) {
    my $reason = $@ || $!;
    $reason =~ s/\s+\z//;
    print "UNKNOWN - Could not create File::Tail object on $oracle_log: $reason\n";
    exit 3;
}
$fh->nowait(1);    # do not block in read() once the end of the file is reached

my %errors;       # ORA code => message text of its last occurrence
my $start = 0;    # set to 1 once a recent timestamp line has been seen

while ( defined( my $line = $fh->read() ) ) {
    last unless $line;    # empty read: nothing left in the file
    chomp($line);

    # Timestamp lines look like:
    # Wed Jan 24 20:41:40 2007
    if ( $line =~ m{(\w+\s+\d+\s+\d{2}:\d{2}:\d{2}\s+\d{4})} ) {
        my $time = ParseDate($1);

        # Skip the comparison when the text could not be parsed as a date.
        if ( $time && Date_Cmp( $time, $cutoff ) > 0 ) {
            $start = 1;
        }
    }

    # Error lines look like:
    # ORA-00313: open failed for members of log group 1 of thread 1
    if ( ( $start == 1 ) && ( $line =~ m{(ORA-\d+)[:]?\s+(.+)} ) ) {
        #next if ( $line =~ m{ORA-00942} ); #skip some special errors
        #next if ( $line =~ m{ORA-07445} );
        $errors{$1} = $2;
    }
}

if ( scalar keys %errors ) {
    # Sorted so that the plugin output is stable between runs.
    foreach my $tmp ( sort keys %errors ) {
        print $tmp . ": " . $errors{$tmp} . "\n";
    }
    exit 2;
}
else {
    print "Oracle alertlog is ok\n";
    exit 0;
}
修改属主,并确保脚本具有可执行权限(例如再执行 chmod 755 /usr/local/nagios/libexec/check_oracle_alertlog)
# chown nagios:nagios /usr/local/nagios/libexec/check_oracle_alertlog
启动nrpe
# /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
二、监控端设置:
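1.检查监控服务器调用命令是否成功(-c 后面的名称必须是被监控端nrpe.cfg中command[...]定义的命令名;下面示例用到的check_oracle_health并不在上文列出的nrpe.cfg配置中,需要事先另行定义,或者换成已定义的命令名,如db-tnsping)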
# /usr/local/nagios/libexec/check_nrpe -H 192.168.217.129 -c check_oracle_health
OK - tbs USERS usage is 0.01%
tbs UNDOTBS1 usage is 0.03%
tbs TEMP usage is 0.00%
tbs SYSTEM usage is 2.15%
tbs SYSAUX usage is 1.61% | 'tbs_users_usage_pct'=0.01%;90;98
'tbs_users_usage'=4MB;29491;32112;0;32767
'tbs_users_alloc'=5MB;;;0;32767
'tbs_undotbs1_usage_pct'=0.03%;90;98
'tbs_undotbs1_usage'=10MB;29491;32112;0;32767
'tbs_undotbs1_alloc'=730MB;;;0;32767
'tbs_temp_usage_pct'=0.00%;90;98
'tbs_temp_usage'=0MB;29491;32112;0;32767
'tbs_temp_alloc'=60MB;;;0;32767
'tbs_system_usage_pct'=2.15%;90;98
'tbs_system_usage'=703MB;29491;32112;0;32767
'tbs_system_alloc'=710MB;;;0;32767
'tbs_sysaux_usage_pct'=1.61%;90;98
'tbs_sysaux_usage'=526MB;29491;32112;0;32767
'tbs_sysaux_alloc'=600MB;;;0;32767
2.建立数据库监控服务配置文件目录,方便管理
# mkdir /usr/local/nagios/etc/databases
新建监控服务配置文件
# vi /usr/local/nagios/etc/databases/oracle.cfg
# Host object for the Oracle database server; inherits from the
# linux-server template.
define host{
    use                 linux-server
    host_name           dbtest.sink.com
    address             192.168.217.129
    alias               oracle_db
    action_url          /nagios/pnp/index.php?host=$HOSTNAME$
    process_perf_data   1
}
# Template holding the settings shared by every Oracle check on
# dbtest.sink.com. "register 0" makes it a template only: it is not
# registered as a service itself, and the definitions below inherit from
# it through "use".
define service{
    name                    oracle-db-service
    host_name               dbtest.sink.com
    max_check_attempts      5
    normal_check_interval   3
    retry_check_interval    2
    check_period            24x7
    notification_interval   10
    notification_period     24x7
    notification_options    w,u,c,r
    register                0
}

# One service per NRPE command; the argument after "check_nrpe!" is the
# command name requested from the monitored host.
define service{
    use                     oracle-db-service
    service_description     db-alertlog-abnormal
    check_command           check_nrpe!db-alertlog-abnormal
}
define service{
    use                     oracle-db-service
    service_description     db-current-sessions
    check_command           check_nrpe!db-current-sessions
}
define service{
    use                     oracle-db-service
    service_description     db-tnsping
    check_command           check_nrpe!db-tnsping
}
define service{
    use                     oracle-db-service
    service_description     db-connection-time
    check_command           check_nrpe!db-connection-time
}
define service{
    use                     oracle-db-service
    service_description     db-connected-users
    check_command           check_nrpe!db-connected-users
}
define service{
    use                     oracle-db-service
    service_description     db-sga-data-buffer-hit-ratio
    check_command           check_nrpe!db-sga-data-buffer-hit-ratio
}
define service{
    use                     oracle-db-service
    service_description     db-sga-shared-pool-free
    check_command           check_nrpe!db-sga-shared-pool-free
}
define service{
    use                     oracle-db-service
    service_description     db-pga-in-memory-sort-ratio
    check_command           check_nrpe!db-pga-in-memory-sort-ratio
}
define service{
    use                     oracle-db-service
    service_description     db-tablespace-wz
    check_command           check_nrpe!db-tablespace-wz
}
define service{
    use                     oracle-db-service
    service_description     db-tablespace-free
    check_command           check_nrpe!db-tablespace-free
}
define service{
    use                     oracle-db-service
    service_description     db-datafile-io-traffic
    check_command           check_nrpe!db-datafile-io-traffic
}
define service{
    use                     oracle-db-service
    service_description     db-switch-interval
    check_command           check_nrpe!db-switch-interval
}
define service{
    use                     oracle-db-service
    service_description     db-roll-avgactivesize
    check_command           check_nrpe!db-roll-avgactivesize
}
define service{
    use                     oracle-db-service
    service_description     db-event-waiting
    check_command           check_nrpe!db-event-waiting
}
define service{
    use                     oracle-db-service
    service_description     db-top10-logical-reads-obj
    check_command           check_nrpe!db-top10-logical-reads-obj
}
define service{
    use                     oracle-db-service
    service_description     db-top10-physical-reads-obj
    check_command           check_nrpe!db-top10-physical-reads-obj
}
3.在nagios.cfg配置文件加上配置
# vi /usr/local/nagios/etc/nagios.cfg
cfg_file=/usr/local/nagios/etc/databases/oracle.cfg
4.重启nagios,在监控页面就可以看到oracle的服务了
参考:
http://blog.chinaunix.net/uid-23916356-id-3427084.html
http://aviar.blog.51cto.com/361632/124444