举个例子方便大家理解: 假设 A,B,C 三个节点都通过了过滤,最终 A 因为权重值最大被选中执行操作。 但由于某个原因,操作在 A 上失败了。 默认情况下,nova-scheduler 会重新执行过滤操作(重复次数由 scheduler_max_attempts 选项指定,默认是 3)。 那么这时候 RetryFilter 就会将 A 直接刷掉,避免操作再次失败。 RetryFilter 通常作为第一个 filter。
---- 但是,有些操作失败后是不会进行 Retry 的。这里记录一下遇到的问题:创建虚拟机时,在分配网络的阶段抛出了 PortBindingFailed 异常。
# Excerpt of _build_resources; lines consisting only of dots mark code that
# was elided. The visible part calls _build_networks_for_instance and shows
# how exceptions raised inside the try block are mapped.
def _build_resources(self, context, instance, requested_networks,
security_groups, image_meta, block_device_mapping):
.......
try:
......
network_info = self._build_networks_for_instance(context, instance,
requested_networks, security_groups)
......
# These two exception types are re-raised unchanged.
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError):
raise
# UnexpectedTaskStateError is converted to BuildAbortException, keeping the
# original message as the reason.
except exception.UnexpectedTaskStateError as e:
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=e.format_message())
# Every other exception is logged and replaced by BuildAbortException with a
# fixed "not rescheduling" message; the original exception type is not
# propagated to the caller.
except Exception:
# Because this allocation is async any failures are likely to occur
# when the driver accesses network_info during spawn().
LOG.exception(_LE('Failed to allocate network(s)'),
instance=instance)
msg = _('Failed to allocate the network(s), not rescheduling.')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
这里只有 InstanceNotFound 和 UnexpectedDeletingTaskStateError 会被原样抛出;UnexpectedTaskStateError 以及其他所有异常(包括本例中的 PortBindingFailed)都会被转换成 BuildAbortException 异常,而这类异常不会进行 Retry。
---- 回退到调用 _build_resources 函数的地方,可以发现,那里的异常捕获是按类型分别处理的:如果捕获到 BuildAbortException,则直接 return build_results.FAILED,不会尝试重新调度。
---- 如果内部调用的函数抛出的异常不在指定的异常类型之中,则默认转换为 BuildAbortException 异常抛出。
---- 只有在 compute 节点上捕获到 RescheduledException 时,才会给 conductor 发消息,重新调度并创建 VM。
# Excerpt of _do_build_and_run_instance; lines consisting only of dots mark
# code that was elided. The visible part shows the two outcomes of calling
# _build_and_run_instance: ACTIVE on success, RESCHEDULED when a
# RescheduledException is caught.
@hooks.add_hook('build_instance')
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def _do_build_and_run_instance(self, context, instance, image,
request_spec, filter_properties, admin_password, injected_files,
requested_networks, security_groups, block_device_mapping,
node=None, limits=None):
......
try:
# The StopWatch is only used to log how long the build took.
with timeutils.StopWatch() as timer:
# NOTE(review): decoded_files is not a parameter of this function; it is
# presumably derived from injected_files in the elided code above —
# TODO confirm.
self._build_and_run_instance(context, instance, image,
decoded_files, admin_password, requested_networks,
security_groups, block_device_mapping, node, limits,
filter_properties)
LOG.info(_LI('Took %0.2f seconds to build instance.'),
timer.elapsed(), instance=instance)
return build_results.ACTIVE
# Reschedule path: the build request is handed to
# compute_task_api.build_instances again and RESCHEDULED is returned.
# The elided lines may apply further conditions (for example based on
# `retry`) that are not visible in this excerpt.
except exception.RescheduledException as e:
retry = filter_properties.get('retry')
......
self.compute_task_api.build_instances(context, [instance],
image, filter_properties, admin_password,
injected_files, requested_networks, security_groups,
block_device_mapping)
return build_results.RESCHEDULED
再看代码会发现,能够抛出 RescheduledException 异常的地方其实很少,主要是下面两处。
# Excerpt of _do_validation; lines consisting only of dots mark code that was
# elided. `group` and `msg` are defined in the elided parts. Both policy
# branches shown here raise RescheduledException; the conditions that guard
# each raise are not visible in this excerpt.
def _do_validation(context, instance, group_hint):
....
# Branch taken when the group's policies contain 'anti-affinity'.
if 'anti-affinity' in group.policies:
......
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
# Branch taken when the group's policies contain 'affinity'.
elif 'affinity' in group.policies:
......
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
# Excerpt of _build_and_run_instance; lines consisting only of dots mark code
# that was elided. The visible part claims resources through the resource
# tracker, enters the _build_resources context manager, and shows the final
# catch-all handler that turns an exception into RescheduledException.
def _build_and_run_instance(self, context, instance, image, injected_files,
admin_password, requested_networks, security_groups,
block_device_mapping, node, limits, filter_properties):
.......
try:
rt = self._get_resource_tracker(node)
with rt.instance_claim(context, instance, limits):
......
# image_meta is not a parameter; it is defined in the elided code.
with self._build_resources(context, instance,
requested_networks, security_groups, image_meta,
block_device_mapping) as resources:
......
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
# NOTE(review): the long run of dots below may hide additional except
# clauses, not only the body of this handler — TODO confirm against the
# full source.
............
# Catch-all: an exception that reaches this handler is wrapped in
# RescheduledException, with the original error text as the reason.
except Exception as e:
......
raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=six.text_type(e))
注:上面的代码是 OpenStack N 版本(Newton)的代码。
---- 但是,有些操作失败后是不会进行 Retry 的。这里记录一下遇到的问题:创建虚拟机时,在分配网络的阶段抛出了 PortBindingFailed 异常。
# Excerpt of _build_resources; lines consisting only of dots mark code that
# was elided. The visible part calls _build_networks_for_instance and shows
# how exceptions raised inside the try block are mapped.
def _build_resources(self, context, instance, requested_networks,
security_groups, image_meta, block_device_mapping):
.......
try:
......
network_info = self._build_networks_for_instance(context, instance,
requested_networks, security_groups)
......
# These two exception types are re-raised unchanged.
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError):
raise
# UnexpectedTaskStateError is converted to BuildAbortException, keeping the
# original message as the reason.
except exception.UnexpectedTaskStateError as e:
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=e.format_message())
# Every other exception is logged and replaced by BuildAbortException with a
# fixed "not rescheduling" message; the original exception type is not
# propagated to the caller.
except Exception:
# Because this allocation is async any failures are likely to occur
# when the driver accesses network_info during spawn().
LOG.exception(_LE('Failed to allocate network(s)'),
instance=instance)
msg = _('Failed to allocate the network(s), not rescheduling.')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
这里只有 InstanceNotFound 和 UnexpectedDeletingTaskStateError 会被原样抛出;UnexpectedTaskStateError 以及其他所有异常(包括本例中的 PortBindingFailed)都会被转换成 BuildAbortException 异常,而这类异常不会进行 Retry。
---- 回退到调用 _build_resources 函数的地方,可以发现,那里的异常捕获是按类型分别处理的:如果捕获到 BuildAbortException,则直接 return build_results.FAILED,不会尝试重新调度。
---- 如果内部调用的函数抛出的异常不在指定的异常类型之中,则默认转换为 BuildAbortException 异常抛出。
---- 只有在 compute 节点上捕获到 RescheduledException 时,才会给 conductor 发消息,重新调度并创建 VM。
# Excerpt of _do_build_and_run_instance; lines consisting only of dots mark
# code that was elided. The visible part shows the two outcomes of calling
# _build_and_run_instance: ACTIVE on success, RESCHEDULED when a
# RescheduledException is caught.
@hooks.add_hook('build_instance')
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def _do_build_and_run_instance(self, context, instance, image,
request_spec, filter_properties, admin_password, injected_files,
requested_networks, security_groups, block_device_mapping,
node=None, limits=None):
......
try:
# The StopWatch is only used to log how long the build took.
with timeutils.StopWatch() as timer:
# NOTE(review): decoded_files is not a parameter of this function; it is
# presumably derived from injected_files in the elided code above —
# TODO confirm.
self._build_and_run_instance(context, instance, image,
decoded_files, admin_password, requested_networks,
security_groups, block_device_mapping, node, limits,
filter_properties)
LOG.info(_LI('Took %0.2f seconds to build instance.'),
timer.elapsed(), instance=instance)
return build_results.ACTIVE
# Reschedule path: the build request is handed to
# compute_task_api.build_instances again and RESCHEDULED is returned.
# The elided lines may apply further conditions (for example based on
# `retry`) that are not visible in this excerpt.
except exception.RescheduledException as e:
retry = filter_properties.get('retry')
......
self.compute_task_api.build_instances(context, [instance],
image, filter_properties, admin_password,
injected_files, requested_networks, security_groups,
block_device_mapping)
return build_results.RESCHEDULED
再看代码会发现,能够抛出 RescheduledException 异常的地方其实很少,主要是下面两处。
# Excerpt of _do_validation; lines consisting only of dots mark code that was
# elided. `group` and `msg` are defined in the elided parts. Both policy
# branches shown here raise RescheduledException; the conditions that guard
# each raise are not visible in this excerpt.
def _do_validation(context, instance, group_hint):
....
# Branch taken when the group's policies contain 'anti-affinity'.
if 'anti-affinity' in group.policies:
......
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
# Branch taken when the group's policies contain 'affinity'.
elif 'affinity' in group.policies:
......
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
# Excerpt of _build_and_run_instance; lines consisting only of dots mark code
# that was elided. The visible part claims resources through the resource
# tracker, enters the _build_resources context manager, and shows the final
# catch-all handler that turns an exception into RescheduledException.
def _build_and_run_instance(self, context, instance, image, injected_files,
admin_password, requested_networks, security_groups,
block_device_mapping, node, limits, filter_properties):
.......
try:
rt = self._get_resource_tracker(node)
with rt.instance_claim(context, instance, limits):
......
# image_meta is not a parameter; it is defined in the elided code.
with self._build_resources(context, instance,
requested_networks, security_groups, image_meta,
block_device_mapping) as resources:
......
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
# NOTE(review): the long run of dots below may hide additional except
# clauses, not only the body of this handler — TODO confirm against the
# full source.
............
# Catch-all: an exception that reaches this handler is wrapped in
# RescheduledException, with the original error text as the reason.
except Exception as e:
......
raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=six.text_type(e))
注:上面的代码是 OpenStack N 版本(Newton)的代码。