aodh的告警计算模块为aodh-evaluator
1、aodh-evaluator 的启动:源码位于 aodh/aodh/evaluator/__init__.py 中 AlarmEvaluationService 的 start 方法
def start(self):
    """Start the service and schedule the periodic alarm evaluation.

    Calls the parent ``start()``, starts the partition coordinator and
    joins the group named by ``PARTITIONING_GROUP_NAME``. If any evaluators
    are loaded, ``_evaluate_assigned_alarms`` is registered on ``self.tg``
    as a timer that fires every ``conf.evaluation_interval``.
    """
    super(AlarmEvaluationService, self).start()

    coordinator = self.partition_coordinator
    coordinator.start()
    coordinator.join_group(self.PARTITIONING_GROUP_NAME)

    # allow time for coordination if necessary
    delay_start = coordinator.is_active()

    if not self.evaluators:
        # Nothing to evaluate with, so no timer is scheduled.
        return

    interval = self.conf.evaluation_interval
    # The first run is postponed by one interval only when the coordinator
    # reports itself active; otherwise no initial delay is requested.
    initial_delay = interval if delay_start else None
    self.tg.add_timer(
        interval,
        self._evaluate_assigned_alarms,
        initial_delay=initial_delay)
2、告警分配 _evaluate_assigned_alarms:该功能应该是参考了 ceilometer 的告警分配机制,每个计算模块(evaluator)只分配到一部分告警规则进行计算
def _evaluate_assigned_alarms(self):
    """Evaluate every alarm currently assigned to this evaluator.

    The alarms come from ``self._assigned_alarms()`` and each one is passed
    to ``self._evaluate_alarm``. Any exception raised during the cycle is
    logged and swallowed, so a failed cycle never propagates to the caller.
    """
    try:
        alarms = self._assigned_alarms()
        # Pass the count as a logger argument instead of %-formatting it
        # eagerly, so interpolation only happens if the record is emitted.
        LOG.info(_('initiating evaluation cycle on %d alarms'), len(alarms))
        for alarm in alarms:
            self._evaluate_alarm(alarm)
    except Exception:
        # Deliberate best-effort: log with traceback and keep going.
        LOG.exception(_('alarm evaluation cycle failed'))
3、计算告警状态 _evaluate_alarm
def _evaluate_alarm(self, alarm):
    """Run the evaluator registered for ``alarm.type`` on a single alarm.

    An alarm whose type has no entry in ``self.evaluators`` is skipped with
    a debug message. Exceptions raised while looking up or running the
    evaluator are logged and not re-raised.
    """
    if alarm.type in self.evaluators:
        LOG.debug('evaluating alarm %s', alarm.alarm_id)
        try:
            extension_for_type = self.evaluators[alarm.type]
            extension_for_type.obj.evaluate(alarm)
        except Exception:
            LOG.exception(_('Failed to evaluate alarm %s'), alarm.alarm_id)
    else:
        LOG.debug('skipping alarm %s: type unsupported', alarm.alarm_id)
其中:evaluators 从entry_point中获取
# Entry-point namespace under which the alarm evaluator plugins are registered.
EVALUATOR_EXTENSIONS_NAMESPACE = "aodh.evaluator"
# Build the manager for every extension found in that namespace. Elsewhere in
# these notes the evaluators are looked up by alarm type and used through
# `.obj.evaluate(alarm)`.
# NOTE(review): `extension` is presumably stevedore's module, in which case
# invoke_on_load/invoke_args instantiate each plugin with self.conf - confirm.
self.evaluators = extension.ExtensionManager(
    namespace=self.EVALUATOR_EXTENSIONS_NAMESPACE,
    invoke_on_load=True,
    invoke_args=(self.conf,))
在entry_points中,定义了如下扩展接口,分别表示不同的告警状态计算方式
[aodh.evaluator]
gnocchi_aggregation_by_metrics_threshold = aodh.evaluator.gnocchi:GnocchiAggregationMetricsThresholdEvaluator
combination = aodh.evaluator.combination:CombinationEvaluator
composite = aodh.evaluator.composite:CompositeEvaluator
gnocchi_resources_threshold = aodh.evaluator.gnocchi:GnocchiResourceThresholdEvaluator
gnocchi_aggregation_by_resources_threshold = aodh.evaluator.gnocchi:GnocchiAggregationResourcesThresholdEvaluator
threshold = aodh.evaluator.threshold:ThresholdEvaluator
当采用ceilometer alarm-gnocchi-resources-threshold-create创建告警的时候,告警类型是gnocchi_resources_threshold,统计信息的获取是调用gnocchi的api
如果采用ceilometer alarm-create,默认的告警类型是threshold,统计信息的获取是通过ceilometer
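4、gnocchi 类告警对应的告警计算源码位于 aodh/evaluator/gnocchi.py(按上面的 entry_points,gnocchi_resources_threshold 对应 GnocchiResourceThresholdEvaluator;下面贴出的 GnocchiAggregationResourcesThresholdEvaluator 对应的是 gnocchi_aggregation_by_resources_threshold)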
__statistics为实现每个资源的统计信息的获取方式class GnocchiAggregationResourcesThresholdEvaluator(GnocchiBase): def _statistics(self, rule, start, end): # FIXME(sileht): In case of a heat autoscaling stack decide to # delete an instance, the gnocchi metrics associated to this # instance will be no more updated and when the alarm will ask # for the aggregation, gnocchi will raise a 'No overlap' # exception. # So temporary set 'needed_overlap' to 0 to disable the # gnocchi checks about missing points. For more detail see: # https://bugs.launchpad.net/gnocchi/+bug/1479429 try: return self._gnocchi_client.metric.aggregation( metrics=rule['metric'], query=jsonutils.loads(rule['query']), resource_type=rule["resource_type"], start=start, stop=end, aggregation=rule['aggregation_method'], needed_overlap=0, ) except Exception: LOG.exception(_('alarm stats retrieval failed')) return []5、GnocchiAggregationResourcesThresholdEvaluator的父类GnocchiBase代码如下:
_sanitize根据告警规则,获取相应的统计周期下的数据:
class GnocchiBase(threshold.ThresholdEvaluator):
    """Threshold evaluator base class that owns a Gnocchi client."""

    def __init__(self, conf):
        """Initialise the parent evaluator and build the Gnocchi client.

        :param conf: configuration object; read for ``service_credentials``
                     (``interface``, ``region_name``) and ``gnocchi_url``.
        """
        super(GnocchiBase, self).__init__(conf)
        session = keystone_client.get_session(conf)
        credentials = conf.service_credentials
        self._gnocchi_client = client.Client(
            '1',
            session,
            interface=credentials.interface,
            region_name=credentials.region_name,
            endpoint_override=conf.gnocchi_url)

    @staticmethod
    def _sanitize(rule, statistics):
        """Return the datapoints that correspond to the alarm granularity.

        Each entry of ``statistics`` is indexed as ``[1]`` for its
        granularity and ``[2]`` for its value. Only values whose granularity
        equals ``rule['granularity']`` are kept, and of those only the last
        ``rule['evaluation_periods']`` are returned.
        """
        # TODO(sileht): if there's no direct match, but there is an archive
        # policy with granularity that's an even divisor or the period,
        # we could potentially do a mean-of-means (or max-of-maxes or whatever,
        # but not a stddev-of-stddevs).
        # TODO(sileht): support alarm['exclude_outliers']
        LOG.debug('sanitize stats %s', statistics)

        matching_values = []
        for stats in statistics:
            if stats[1] == rule['granularity']:
                matching_values.append(stats[2])

        # Keep only the most recent window the rule asks for.
        recent_values = matching_values[-rule['evaluation_periods']:]
        LOG.debug('pruned statistics to %d', len(recent_values))
        return recent_values
6、告警状态计算入口 evaluate,源码位于 aodh/evaluator/threshold.py 中
def evaluate(self, alarm):
    """Evaluate one alarm and apply the resulting state transition.

    When the alarm is outside its time constraint, only a debug message is
    logged. Otherwise ``alarm.rule`` is evaluated and the four resulting
    values are handed to ``self._transition_alarm``.
    """
    if self.within_time_constraint(alarm):
        rule_outcome = self.evaluate_rule(alarm.rule)
        state, trending_state, statistics, outside_count = rule_outcome
        self._transition_alarm(alarm, state, trending_state, statistics,
                               outside_count)
    else:
        LOG.debug('Attempted to evaluate alarm %s, but it is not '
                  'within its time constraint.', alarm.alarm_id)
7、计算规则:
def evaluate_rule(self, alarm_rule):
    """Evaluate alarm rule.

    :param alarm_rule: mapping read here for 'evaluation_periods',
                       'comparison_operator' and 'threshold'; it is also
                       passed to the duration/statistics helpers.
    :returns: a 4-tuple ``(state, trending_state, statistics, count)``:

        * not enough datapoints: ``(evaluator.UNKNOWN, None, statistics,
          len(statistics))``;
        * all comparisons agree: ``(ALARM or OK, None, statistics, n)``,
          ALARM when every datapoint satisfies the comparison;
        * mixed results: ``(None, ALARM or OK, statistics, n)``, where the
          trending state follows the last datapoint's comparison.

        ``n`` is the number of datapoints for which the comparison held.
    """
    start, end = self._bound_duration(alarm_rule)
    statistics = self._statistics(alarm_rule, start, end)
    statistics = self._sanitize(alarm_rule, statistics)

    sufficient = len(statistics) >= alarm_rule['evaluation_periods']
    if not sufficient:
        return evaluator.UNKNOWN, None, statistics, len(statistics)

    # Resolve the operator and threshold once rather than per datapoint.
    op = COMPARATORS[alarm_rule['comparison_operator']]
    limit = alarm_rule['threshold']

    def _compare(value):
        LOG.debug('comparing value %(value)s against threshold'
                  ' %(limit)s', {'value': value, 'limit': limit})
        return op(value, limit)

    compared = [_compare(value) for value in statistics]
    distilled = all(compared)
    unequivocal = distilled or not any(compared)
    number_outside = sum(1 for c in compared if c)

    if unequivocal:
        state = evaluator.ALARM if distilled else evaluator.OK
        return state, None, statistics, number_outside

    # Mixed results: report only the direction given by the latest datapoint.
    trending_state = evaluator.ALARM if compared[-1] else evaluator.OK
    return None, trending_state, statistics, number_outside