Ganglia
Ganglia installation
Ganglia packages to install:
yum install ganglia
yum install ganglia-gmond
yum install ganglia-gmond-python
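To confirm the packages installed cleanly, you can query the RPM database (a quick sanity check, assuming an RPM-based system such as CentOS):
rpm -qa | grep ganglia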
Ganglia configuration files:
/etc/ganglia/gmond.conf
/etc/ganglia/conf.d/
Directory for the plugins' configuration files.
/usr/lib64/ganglia/python_modules/
Directory for the Python plugin scripts.
Starting Ganglia:
service gmond restart
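To have gmond start automatically at boot (optional; assumes a SysV-init system such as CentOS 6):
chkconfig gmond on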
gmond.conf
gmond runs in one of two roles: master or slave.
* master: receives data from all nodes and exposes a TCP service from which other applications can pull monitoring information for the entire cluster.
* slave: collects metrics on the local machine and reports them to the master over UDP.
The configuration files for the two roles differ accordingly; the key switches are shown below.
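The role is set by mute and deaf in the globals section. Comparing the two appendices, the master both sends and listens, while the slave only sends:
/* master (Appendix 1): receive and serve */
mute = no
deaf = no
/* slave (Appendix 2): send only */
mute = no
deaf = yes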
master
See Appendix 1 for the master gmond.conf.
slave
See Appendix 2 for the slave gmond.conf.
Plugins
You can write your own Ganglia plugins.
Plugin configuration
Place socket_total.pyconf in the /etc/ganglia/conf.d directory. collect_every sets how often (in seconds) the callback is invoked, and time_threshold is the maximum interval before the value is re-sent even if it has not changed:
modules {
  module {
    name = "socket_total"
    language = "python"
  }
}
collection_group {
  collect_every = 30
  time_threshold = 50
  metric {
    name = "socket_total"
    title = "Socket_Total"
    value_threshold = 100
  }
}
Plugin implementation
Place socket_total.py in the /usr/lib64/ganglia/python_modules/ directory:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import subprocess

descriptors = []

def socket_handler(name):
    '''Count open network sockets by piping lsof through wc.'''
    handle = subprocess.Popen("lsof -i -n | wc -l",
                              stdout=subprocess.PIPE, shell=True)
    output = handle.communicate()[0]
    return float(output)

def metric_init(params):
    '''Build the metric descriptors; called once by gmond at startup.'''
    global descriptors
    descriptor = {
        'name': 'socket_total',
        'call_back': socket_handler,
        'time_max': 30,
        'value_type': 'float',
        'units': 'num',
        'slope': 'both',
        'format': '%f',
        'description': 'Number of socket connections',
        'groups': 'network'
    }
    descriptors = [descriptor]
    return descriptors

def metric_cleanup():
    '''Clean up the metric module; called once when gmond shuts down.'''
    pass

# This code is for debugging and unit testing
if __name__ == '__main__':
    metric_init({})
    for d in descriptors:
        v = d['call_back'](d['name'])
        print 'value for %s is %s' % (d['name'], v)
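The __main__ block lets you exercise the module outside of gmond before deploying it; a run might look like this (the value shown is illustrative):
# python /usr/lib64/ganglia/python_modules/socket_total.py
value for socket_total is 123.0
Once both files are in place, restart gmond so it loads the new module:
service gmond restart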
Testing the Ganglia installation
# telnet gmond_core_ip gmond_server_port
telnet 1.2.3.4 1234
If you get back a stream of XML data in which you can find the IP addresses of all the monitored nodes, the installation is running correctly.
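If you would rather script this check, here is a minimal Python sketch that reads the same XML dump from gmond's TCP channel (the host and port are the hypothetical values from the telnet example above):
#!/usr/bin/env python
# Minimal check of gmond's tcp_accept_channel. Host and port are
# hypothetical, matching the telnet example above.
import socket

def fetch_cluster_xml(host='1.2.3.4', port=1234):
    '''gmond dumps the full cluster XML and then closes the connection.'''
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect((host, port))
    chunks = []
    while True:
        data = s.recv(4096)
        if not data:
            break
        chunks.append(data)
    s.close()
    return ''.join(chunks)

if __name__ == '__main__':
    xml = fetch_cluster_xml()
    # Every monitored node appears as a <HOST NAME="..." IP="..."> element.
    print 'hosts reported:', xml.count('<HOST ')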
Appendix 1
/* This configuration is as close to 2.5.x default behavior as possible
The values closely match ./gmond/metric.h definitions in 2.5.x */
globals {
daemonize = yes
setuid = yes
user = root
debug_level = 0
max_udp_msg_len = 1472
mute = no
deaf = no
allow_extra_data = yes
host_dmax = 86400 /*secs. Expires (removes from web interface) hosts in 1 day */
host_tmax = 20 /*secs */
cleanup_threshold = 300 /*secs */
gexec = no
# By default gmond will use reverse DNS resolution when displaying your hostname
# Uncommenting the following value will override that value.
# override_hostname = "mywebserver.domain.com"
# If you are not using multicast this value should be set to something other than 0.
# Otherwise if you restart aggregator gmond you will get empty graphs. 60 seconds is reasonable
send_metadata_interval = 25 /*secs */
}
/*
* The cluster attributes specified will be used as part of the <CLUSTER>
* tag that will wrap all hosts collected by this instance.
*/
cluster {
name = "dtdream"
owner = "dtdream"
latlong = "dtdream"
url = "dtdream"
}
/* The host section describes attributes of the host, like the location */
host {
location = "dtdream"
}
/* Feel free to specify as many udp_send_channels as you like. Gmond
used to only support having a single channel */
udp_send_channel {
host = 192.168.137.82
port = 61094
ttl = 1
}
/* You can specify as many udp_recv_channels as you like as well. */
udp_recv_channel {
port = 61094
}
/* You can specify as many tcp_accept_channels as you like to share
an xml description of the state of the cluster */
tcp_accept_channel {
port = 61094
# If you want to gzip XML output
gzip_output = no
}
/* Channel to receive sFlow datagrams */
#udp_recv_channel {
# port = 61094
#}
/* Optional sFlow settings */
#sflow {
# udp_port = 61094
# accept_vm_metrics = yes
# accept_jvm_metrics = yes
# multiple_jvm_instances = no
# accept_http_metrics = yes
# multiple_http_instances = no
# accept_memcache_metrics = yes
# multiple_memcache_instances = no
#}
/* Each metrics module that is referenced by gmond must be specified and
loaded. If the module has been statically linked with gmond, it does
not require a load path. However all dynamically loadable modules must
include a load path. */
modules {
module {
name = "core_metrics"
}
module {
name = "cpu_module"
path = "modcpu.so"
}
module {
name = "disk_module"
path = "moddisk.so"
}
module {
name = "load_module"
path = "modload.so"
}
module {
name = "mem_module"
path = "modmem.so"
}
module {
name = "net_module"
path = "modnet.so"
}
module {
name = "proc_module"
path = "modproc.so"
}
module {
name = "sys_module"
path = "modsys.so"
}
}
/* The old internal 2.5.x metric array has been replaced by the following
collection_group directives. What follows is the default behavior for
collecting and sending metrics that is as close to 2.5.x behavior as
possible. */
/* This collection group will cause a heartbeat (or beacon) to be sent every
20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
the age of the running gmond. */
collection_group {
collect_once = yes
time_threshold = 20
metric {
name = "heartbeat"
}
}
/* This collection group will send general info about this host*/
collection_group {
collect_every = 60
time_threshold = 60
metric {
name = "cpu_num"
title = "CPU Count"
}
metric {
name = "cpu_speed"
title = "CPU Speed"
}
metric {
name = "mem_total"
title = "Memory Total"
}
metric {
name = "swap_total"
title = "Swap Space Total"
}
metric {
name = "boottime"
title = "Last Boot Time"
}
metric {
name = "machine_type"
title = "Machine Type"
}
metric {
name = "os_name"
title = "Operating System"
}
metric {
name = "os_release"
title = "Operating System Release"
}
metric {
name = "location"
title = "Location"
}
}
/* This collection group will send the status of gexecd for this host
every 300 secs.*/
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
collection_group {
collect_once = yes
time_threshold = 300
metric {
name = "gexec"
title = "Gexec Status"
}
}
/* This collection group will collect the CPU status info every 20 secs.
The time threshold is set to 90 seconds. In honesty, this
time_threshold could be set significantly higher to reduce
unnecessary network chatter. */
collection_group {
collect_every = 20
time_threshold = 90
/* CPU status */
metric {
name = "cpu_user"
value_threshold = "1.0"
title = "CPU User"
}
metric {
name = "cpu_system"
value_threshold = "1.0"
title = "CPU System"
}
metric {
name = "cpu_idle"
value_threshold = "5.0"
title = "CPU Idle"
}
metric {
name = "cpu_nice"
value_threshold = "1.0"
title = "CPU Nice"
}
metric {
name = "cpu_aidle"
value_threshold = "5.0"
title = "CPU aidle"
}
metric {
name = "cpu_wio"
value_threshold = "1.0"
title = "CPU wio"
}
metric {
name = "cpu_steal"
value_threshold = "1.0"
title = "CPU steal"
}
/* The next two metrics are optional if you want more detail...
... since they are accounted for in cpu_system.
metric {
name = "cpu_intr"
value_threshold = "1.0"
title = "CPU intr"
}
metric {
name = "cpu_sintr"
value_threshold = "1.0"
title = "CPU sintr"
}
*/
}
collection_group {
collect_every = 20
time_threshold = 90
/* Load Averages */
metric {
name = "load_one"
value_threshold = "1.0"
title = "One Minute Load Average"
}
metric {
name = "load_five"
value_threshold = "1.0"
title = "Five Minute Load Average"
}
metric {
name = "load_fifteen"
value_threshold = "1.0"
title = "Fifteen Minute Load Average"
}
}
/* This group collects the number of running and total processes */
collection_group {
collect_every = 80
time_threshold = 950
metric {
name = "proc_run"
value_threshold = "1.0"
title = "Total Running Processes"
}
metric {
name = "proc_total"
value_threshold = "1.0"
title = "Total Processes"
}
}
/* This collection group grabs the volatile memory metrics every 40 secs and
sends them at least every 180 secs. This time_threshold can be increased
significantly to reduce unneeded network traffic. */
collection_group {
collect_every = 40
time_threshold = 180
metric {
name = "mem_free"
value_threshold = "1024.0"
title = "Free Memory"
}
metric {
name = "mem_shared"
value_threshold = "1024.0"
title = "Shared Memory"
}
metric {
name = "mem_buffers"
value_threshold = "1024.0"
title = "Memory Buffers"
}
metric {
name = "mem_cached"
value_threshold = "1024.0"
title = "Cached Memory"
}
metric {
name = "swap_free"
value_threshold = "1024.0"
title = "Free Swap Space"
}
}
collection_group {
collect_every = 40
time_threshold = 300
metric {
name = "bytes_out"
value_threshold = 4096
title = "Bytes Sent"
}
metric {
name = "bytes_in"
value_threshold = 4096
title = "Bytes Received"
}
metric {
name = "pkts_in"
value_threshold = 256
title = "Packets Received"
}
metric {
name = "pkts_out"
value_threshold = 256
title = "Packets Sent"
}
}
/* Different than 2.5.x default since the old config made no sense */
collection_group {
collect_every = 1800
time_threshold = 3600
metric {
name = "disk_total"
value_threshold = 1.0
title = "Total Disk Space"
}
}
collection_group {
collect_every = 40
time_threshold = 180
metric {
name = "disk_free"
value_threshold = 1.0
title = "Disk Space Available"
}
metric {
name = "part_max_used"
value_threshold = 1.0
title = "Maximum Disk Space Used"
}
}
include ("/etc/ganglia/conf.d/*.conf")
Appendix 2
/* This configuration is as close to 2.5.x default behavior as possible
The values closely match ./gmond/metric.h definitions in 2.5.x */
globals {
daemonize = yes
setuid = yes
user = root
debug_level = 0
max_udp_msg_len = 1472
mute = no
deaf = yes
allow_extra_data = yes
host_dmax = 86400 /*secs. Expires (removes from web interface) hosts in 1 day */
host_tmax = 20 /*secs */
cleanup_threshold = 300 /*secs */
gexec = no
# By default gmond will use reverse DNS resolution when displaying your hostname
# Uncommenting the following value will override that value.
# override_hostname = "mywebserver.domain.com"
# If you are not using multicast this value should be set to something other than 0.
# Otherwise if you restart aggregator gmond you will get empty graphs. 60 seconds is reasonable
send_metadata_interval = 25 /*secs */
}
/*
* The cluster attributes specified will be used as part of the <CLUSTER>
* tag that will wrap all hosts collected by this instance.
*/
cluster {
name = "dtdream"
owner = "dtdream"
latlong = "dtdream"
url = "dtdream"
}
/* The host section describes attributes of the host, like the location */
host {
location = "dtdream"
}
/* Feel free to specify as many udp_send_channels as you like. Gmond
used to only support having a single channel */
udp_send_channel {
host = 192.168.137.82
port = 61094
ttl = 1
}
/* You can specify as many udp_recv_channels as you like as well. */
udp_recv_channel {
port = 61094
}
/* You can specify as many tcp_accept_channels as you like to share
an xml description of the state of the cluster */
tcp_accept_channel {
port = 61094
# If you want to gzip XML output
gzip_output = no
}
/* Channel to receive sFlow datagrams */
#udp_recv_channel {
# port = 61094
#}
/* Optional sFlow settings */
#sflow {
# udp_port = 61094
# accept_vm_metrics = yes
# accept_jvm_metrics = yes
# multiple_jvm_instances = no
# accept_http_metrics = yes
# multiple_http_instances = no
# accept_memcache_metrics = yes
# multiple_memcache_instances = no
#}
/* Each metrics module that is referenced by gmond must be specified and
loaded. If the module has been statically linked with gmond, it does
not require a load path. However all dynamically loadable modules must
include a load path. */
modules {
module {
name = "core_metrics"
}
module {
name = "cpu_module"
path = "modcpu.so"
}
module {
name = "disk_module"
path = "moddisk.so"
}
module {
name = "load_module"
path = "modload.so"
}
module {
name = "mem_module"
path = "modmem.so"
}
module {
name = "net_module"
path = "modnet.so"
}
module {
name = "proc_module"
path = "modproc.so"
}
module {
name = "sys_module"
path = "modsys.so"
}
}
/* The old internal 2.5.x metric array has been replaced by the following
collection_group directives. What follows is the default behavior for
collecting and sending metrics that is as close to 2.5.x behavior as
possible. */
/* This collection group will cause a heartbeat (or beacon) to be sent every
20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
the age of the running gmond. */
collection_group {
collect_once = yes
time_threshold = 20
metric {
name = "heartbeat"
}
}
/* This collection group will send general info about this host*/
collection_group {
collect_every = 60
time_threshold = 60
metric {
name = "cpu_num"
title = "CPU Count"
}
metric {
name = "cpu_speed"
title = "CPU Speed"
}
metric {
name = "mem_total"
title = "Memory Total"
}
metric {
name = "swap_total"
title = "Swap Space Total"
}
metric {
name = "boottime"
title = "Last Boot Time"
}
metric {
name = "machine_type"
title = "Machine Type"
}
metric {
name = "os_name"
title = "Operating System"
}
metric {
name = "os_release"
title = "Operating System Release"
}
metric {
name = "location"
title = "Location"
}
}
/* This collection group will send the status of gexecd for this host
every 300 secs.*/
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
collection_group {
collect_once = yes
time_threshold = 300
metric {
name = "gexec"
title = "Gexec Status"
}
}
/* This collection group will collect the CPU status info every 20 secs.
The time threshold is set to 90 seconds. In honesty, this
time_threshold could be set significantly higher to reduce
unnecessary network chatter. */
collection_group {
collect_every = 20
time_threshold = 90
/* CPU status */
metric {
name = "cpu_user"
value_threshold = "1.0"
title = "CPU User"
}
metric {
name = "cpu_system"
value_threshold = "1.0"
title = "CPU System"
}
metric {
name = "cpu_idle"
value_threshold = "5.0"
title = "CPU Idle"
}
metric {
name = "cpu_nice"
value_threshold = "1.0"
title = "CPU Nice"
}
metric {
name = "cpu_aidle"
value_threshold = "5.0"
title = "CPU aidle"
}
metric {
name = "cpu_wio"
value_threshold = "1.0"
title = "CPU wio"
}
metric {
name = "cpu_steal"
value_threshold = "1.0"
title = "CPU steal"
}
/* The next two metrics are optional if you want more detail...
... since they are accounted for in cpu_system.
metric {
name = "cpu_intr"
value_threshold = "1.0"
title = "CPU intr"
}
metric {
name = "cpu_sintr"
value_threshold = "1.0"
title = "CPU sintr"
}
*/
}
collection_group {
collect_every = 20
time_threshold = 90
/* Load Averages */
metric {
name = "load_one"
value_threshold = "1.0"
title = "One Minute Load Average"
}
metric {
name = "load_five"
value_threshold = "1.0"
title = "Five Minute Load Average"
}
metric {
name = "load_fifteen"
value_threshold = "1.0"
title = "Fifteen Minute Load Average"
}
}
/* This group collects the number of running and total processes */
collection_group {
collect_every = 80
time_threshold = 950
metric {
name = "proc_run"
value_threshold = "1.0"
title = "Total Running Processes"
}
metric {
name = "proc_total"
value_threshold = "1.0"
title = "Total Processes"
}
}
/* This collection group grabs the volatile memory metrics every 40 secs and
sends them at least every 180 secs. This time_threshold can be increased
significantly to reduce unneeded network traffic. */
collection_group {
collect_every = 40
time_threshold = 180
metric {
name = "mem_free"
value_threshold = "1024.0"
title = "Free Memory"
}
metric {
name = "mem_shared"
value_threshold = "1024.0"
title = "Shared Memory"
}
metric {
name = "mem_buffers"
value_threshold = "1024.0"
title = "Memory Buffers"
}
metric {
name = "mem_cached"
value_threshold = "1024.0"
title = "Cached Memory"
}
metric {
name = "swap_free"
value_threshold = "1024.0"
title = "Free Swap Space"
}
}
collection_group {
collect_every = 40
time_threshold = 300
metric {
name = "bytes_out"
value_threshold = 4096
title = "Bytes Sent"
}
metric {
name = "bytes_in"
value_threshold = 4096
title = "Bytes Received"
}
metric {
name = "pkts_in"
value_threshold = 256
title = "Packets Received"
}
metric {
name = "pkts_out"
value_threshold = 256
title = "Packets Sent"
}
}
/* Different than 2.5.x default since the old config made no sense */
collection_group {
collect_every = 1800
time_threshold = 3600
metric {
name = "disk_total"
value_threshold = 1.0
title = "Total Disk Space"
}
}
collection_group {
collect_every = 40
time_threshold = 180
metric {
name = "disk_free"
value_threshold = 1.0
title = "Disk Space Available"
}
metric {
name = "part_max_used"
value_threshold = 1.0
title = "Maximum Disk Space Used"
}
}
include ("/etc/ganglia/conf.d/*.conf")