浅析linux 内核高精度定时器（hrtimer）实现机制（二）

最新推荐文章于 2025-05-17 15:25:47 发布

_kerneler

最新推荐文章于 2025-05-17 15:25:47 发布

阅读量35

点赞数

文章标签： linux redis 运维

原文链接：https://zhuanlan.zhihu.com/p/544513145

版权

转：https://zhuanlan.zhihu.com/p/544513145
在这里插入图片描述
3.5 定时器的迁移 switch_hrtimer_base
前面提到过了，在正式激活一个高分辨率定时器的时候，有可能对其进行迁移，这个动作是在switch_hrtimer_base函数中完成的：

/*
 * switch_hrtimer_base - choose the clock base a timer will be queued on,
 * moving the timer to another CPU's base when that is possible.
 *
 * @timer:  timer that is about to be activated
 * @base:   clock base the timer currently belongs to; this function unlocks
 *          base->cpu_base->lock on the migration path, so the caller is
 *          expected to hold that lock on entry
 * @pinned: forwarded to get_target_base(); non-zero keeps the current CPU
 *
 * Returns the clock base the timer ends up on. On return the lock that is
 * held is the cpu_base lock of the returned base.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
            int pinned)
{
    struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
    struct hrtimer_clock_base *new_base;
    int basenum = base->index;

    /* hrtimer_cpu_base of the CPU we are running on */
    this_cpu_base = this_cpu_ptr(&hrtimer_bases);
    /* Candidate hrtimer_cpu_base the timer could be moved to */
    new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
    /* Clock base of the same index inside the candidate cpu base */
    new_base = &new_cpu_base->clock_base[basenum];

    /* Different bases: the timer has to be migrated */
    if (base != new_base) {
        /* Never migrate a timer whose callback is currently running */
        if (unlikely(hrtimer_callback_running(timer)))
            return base;

        /*
         * Point timer->base at the global placeholder migration_base so
         * that lock_hrtimer_base() can see a migration is in progress.
         */
        WRITE_ONCE(timer->base, &migration_base);
        /* Drop the lock of the old cpu base ... */
        raw_spin_unlock(&base->cpu_base->lock);
        /* ... and take the lock of the new one */
        raw_spin_lock(&new_base->cpu_base->lock);

        /*
         * Target is a remote CPU and the timer would expire before
         * everything already queued there: fall back to this CPU.
         */
        if (new_cpu_base != this_cpu_base &&
            hrtimer_check_target(timer, new_base)) {
            /* Release the lock just taken on the new cpu base */
            raw_spin_unlock(&new_base->cpu_base->lock);
            /* Re-acquire the lock of the old cpu base */
            raw_spin_lock(&base->cpu_base->lock);
            /* Queue the timer on the current CPU instead */
            new_cpu_base = this_cpu_base;
            /* Restore the timer's original clock base */
            WRITE_ONCE(timer->base, base);
            /* Re-evaluate with the new target */
            goto again;
        }
        WRITE_ONCE(timer->base, new_base);
    } else {
        /*
         * No migration needed, but the same "remote CPU and earlier than
         * everything queued there" check still applies.
         */
        if (new_cpu_base != this_cpu_base &&
            hrtimer_check_target(timer, new_base)) {
            /* Queue the timer on the current CPU instead */
            new_cpu_base = this_cpu_base;
            /* Re-evaluate with the new target */
            goto again;
        }
    }
    return new_base;
}

该函数首先会调用 get_target_base 函数，试着挑选出一个新的 hrtimer_cpu_base 结构体变量：

/*
 * get_target_base - pick the hrtimer_cpu_base a timer should be queued on.
 *
 * @base:   cpu base of the current CPU, returned when no migration happens
 * @pinned: non-zero if the timer must not be moved to another CPU
 *
 * Only on CONFIG_SMP && CONFIG_NO_HZ_COMMON builds, with timer migration
 * enabled and the timer not pinned, is the cpu base of the CPU returned by
 * get_nohz_timer_target() used. In every other case @base is returned.
 */
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
					 int pinned)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
	if (static_branch_likely(&timers_migration_enabled) && !pinned) {
		int target_cpu = get_nohz_timer_target();

		return &per_cpu(hrtimer_bases, target_cpu);
	}
#endif
	return base;
}

在分析（低分辨率）定时器层的时候也有定时器迁移的概念，也是用叫做get_target_base的函数选择新的迁移位置的，并且代码都非常类似。timers_migration_enabled值将在切换到NO_HZ模式时变成True，而退出NO_HZ模式时变成False。这个变量同时也是（低分辨率）定时器层用来判断是否可以迁移的条件。所以只有在切换到NO_HZ模式下，且定时器没有绑死到某个CPU的情况下，才会选择别的CPU上的hrtimer_cpu_base。get_nohz_timer_target函数会判断当前的CPU是否处于空闲状态，如果不是空闲状态，那还是返回当前的CPU编号，如果真是空闲的话，会找到最近的一个忙的处理器，并返回其编号。当所有条件有一条不满足时直接返回传入的hrtimer_cpu_base结构体指针变量。

hrtimer_callback_running函数只是用来检查要迁移的定时器是否就是当前正在处理的定时器，也就是检查定时器对应的hrtimer_clock_base结构体中的running字段是否等于自己。

/*
 * hrtimer_callback_running - is this timer the one currently being handled?
 *
 * Returns non-zero when the "running" field of the timer's clock base points
 * at @timer itself, zero otherwise.
 */
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
	int is_running = (timer->base->running == timer);

	return is_running;
}

hrtimer_check_target 函数用来检查定时器的到期时间是否早于要迁移到的CPU上即将要到期的那个到期时间：

/*
 * hrtimer_check_target - would @timer become the earliest event on the
 * CPU that owns @new_base?
 *
 * The timer's expiry is first converted to monotonic time by subtracting the
 * offset of @new_base. Returns non-zero when that value is earlier than the
 * target cpu base's expires_next, zero otherwise.
 */
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
	ktime_t mono_expiry = ktime_sub(hrtimer_get_expires(timer),
					new_base->offset);

	if (mono_expiry < new_base->cpu_base->expires_next)
		return 1;

	return 0;
}

如果高分辨率定时器的到期时间比要激活到的CPU上的所有定时器到期时间还要早，且激活到的CPU不是当前CPU，那么如果真的在那个CPU上激活，还要通知那个CPU去重新编程，否则激活过后这个定时器肯定要超时。所以，与其那么麻烦，还不如直接将这个高分辨率定时器激活在当前CPU上算了。而且，这个操作和迁移其实没有关系，哪怕 get_target_base 函数获得的 base 和定时器中指定的 base 是一样的。

还有一点，在迁移的时候，内核会将定时器的 hrtimer_clock_base 结构体变量临时设置成一个全局变量 migration_base 的指针。该变量定义如下：

/*
 * Placeholder cpu base used only while a timer is being migrated.
 * Its first clock base points back at the placeholder itself; no other
 * field is initialised here.
 */
static struct hrtimer_cpu_base migration_cpu_base = {
	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
};

/* timer->base is set to &migration_base for the duration of a migration. */
#define migration_base migration_cpu_base.clock_base[0]
可以看到，这个全局变量什么都没做，只是起到占位的作用。这个变量同时也用在获得定时器所属CPU 的 hrtimer_cpu_base 结构体变量中的自旋锁时：

/*
 * lock_hrtimer_base - lock the cpu base a timer currently belongs to.
 *
 * @timer: timer whose base should be locked
 * @flags: receives the saved interrupt state from raw_spin_lock_irqsave()
 *
 * Loops until the timer is not mid-migration and its base is still the same
 * after the lock was taken. Returns that clock base with the lock of its
 * cpu_base held and interrupts disabled.
 */
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
                         unsigned long *flags)
{
    struct hrtimer_clock_base *base;

    for (;;) {
        base = READ_ONCE(timer->base);
        /* Skip the locking attempt while the timer is being migrated */
        if (likely(base != &migration_base)) {
            /* Take the cpu base lock and disable interrupts */
            raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
            /* Base unchanged while we were acquiring the lock: done */
            if (likely(base == timer->base))
                return base;
            /* The timer moved to another base in the meantime */
            /* Release the lock, restore interrupts and retry */
            raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
        }
        cpu_relax();
    }
}

可以看到，该函数通过判断定时器的base变量是否等于migration_base的指针来判断是否该定时器正在迁移。这样做可以在没正式加锁之前过滤掉很多情况，从而加快速度。

3.6 查找即将到期定时器 __hrtimer_get_next_event

前面提到了，在一个CPU下的所有高分辨率定时器都是放在对应结构体hrtimer_cpu_base内clock_base数组里面的，每种到期时间类型占用一个数组元素。数组的每个元素是一个hrtimer_clock_base 结构体，里面包含有一个红黑树，记录了所有的定时器。那么，如果想找到某个CPU下所有定时器中的即将到期定时器只需要遍历clock_base中所有包含有定时器的hrtimer_clock_base结构体，取出它们红黑树最左下的节点，找出到期时间最小的那个就可以了。内核代码使用__hrtimer_get_next_event函数来找出即将到期的定时器：

/*
 * __hrtimer_get_next_event - find the earliest expiry on a cpu base.
 *
 * @cpu_base:    per-CPU hrtimer base to search
 * @active_mask: which kinds of clock bases to consider
 *               (HRTIMER_ACTIVE_SOFT, HRTIMER_ACTIVE_HARD or both)
 *
 * Returns the earliest expiry found, or KTIME_MAX if nothing was searched or
 * found. Updates cpu_base->softirq_next_timer and/or cpu_base->next_timer as
 * a side effect, but does not write expires_next or softirq_expires_next.
 */
static ktime_t
__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
	unsigned int active;
	struct hrtimer *next_timer = NULL;
	ktime_t expires_next = KTIME_MAX;


	/* Search soft timers only if requested and the softirq is not already activated */
	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
		/* Restrict the search to soft clock bases that hold timers */
		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
		cpu_base->softirq_next_timer = NULL;
		expires_next = __hrtimer_next_event_base(cpu_base, NULL,
							 active, KTIME_MAX);

		next_timer = cpu_base->softirq_next_timer;
	}

	/* Search hard timers if requested */
	if (active_mask & HRTIMER_ACTIVE_HARD) {
		/* Restrict the search to hard clock bases that hold timers */
		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
		/* Seed next_timer with the soft result; a hard timer replaces it only if earlier */
		cpu_base->next_timer = next_timer;
		expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
							 expires_next);
	}

	return expires_next;
}

函数的第一个参数是对应CPU上的hrtimer_cpu_base结构体，第二个参数表示要查找哪种类型的定时器，其主要取值有以下三种：

/* Bit position where the "soft" clock bases start in the active bitmap. */
#define MASK_SHIFT		(HRTIMER_BASE_MONOTONIC_SOFT)
/* Low MASK_SHIFT bits set: selects every "hard" clock base. */
#define HRTIMER_ACTIVE_HARD	((1U << MASK_SHIFT) - 1)
/* The hard mask moved up by MASK_SHIFT: selects every "soft" clock base. */
#define HRTIMER_ACTIVE_SOFT	(HRTIMER_ACTIVE_HARD << MASK_SHIFT)
/* Union of both masks: selects every clock base. */
#define HRTIMER_ACTIVE_ALL	(HRTIMER_ACTIVE_HARD | HRTIMER_ACTIVE_SOFT)

回想前面的介绍，所有定时器按照到期时间类型分成4种，又按照“软”或“硬”分成了两种。HRTIMER_ACTIVE_HARD表示搜索所有“硬”定时器，
HRTIMER_ACTIVE_SOFT表示搜索所有“软”定时器，
HRTIMER_ACTIVE_ALL表示搜索所有类型的定时器。
如果正在执行软中断处理程序，即使指明了需要搜索所有的“软”定时器也会忽略，这是因为在软中断处理程序hrtimer_run_softirq中，退出之前会调用 hrtimer_update_softirq_timer 函数，更新所有“软”定时器。反正马上也要处理了，因此在这里就可以忽略掉了。

在函数 __hrtimer_get_next_event 调用的过程中，会更改传入的 hrtimer_cpu_base 结构体的next_timer 或 softirq_next_timer 变量，但不会更改 expires_next 和 softirq_expires_next 变量，而是通过返回值返回。

__hrtimer_get_next_event 函数会通过调用 __hrtimer_next_event_base 函数，找出指定某些hrtimer_clock_base 中，最近即将到期的定时器和到期时间：

/*
 * __hrtimer_next_event_base - scan selected clock bases for the earliest expiry.
 *
 * @cpu_base:     per-CPU hrtimer base to scan
 * @exclude:      timer to ignore, or NULL
 * @active:       bitmap of the clock bases to look at
 * @expires_next: current best expiry; only earlier expiries replace it
 *
 * Returns the earliest expiry found (in monotonic time, clamped to >= 0), or
 * the @expires_next that was passed in if nothing earlier exists. When
 * @exclude is NULL, cpu_base->next_timer or cpu_base->softirq_next_timer is
 * updated to the timer that produced the new minimum.
 */
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
					 const struct hrtimer *exclude,
					 unsigned int active,
					 ktime_t expires_next)
{
	struct hrtimer_clock_base *base;
	ktime_t expires;

	/* Walk every clock base selected by the bitmap */
	for_each_active_base(base, cpu_base, active) {
		struct timerqueue_node *next;
		struct hrtimer *timer;

		/* First node of the base's timer queue (leftmost rbtree node) */
		next = timerqueue_getnext(&base->active);
		/* Map the queue node back to its hrtimer */
		timer = container_of(next, struct hrtimer, node);
		/* If that timer is the excluded one, use the following node instead */
		if (timer == exclude) {
			next = timerqueue_iterate_next(next);
			if (!next)
				continue;

			timer = container_of(next, struct hrtimer, node);
		}
		/* Normalise the expiry to monotonic time */
		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
		if (expires < expires_next) {
			expires_next = expires;

			/* With an exclusion in effect the cpu base is left untouched */
			if (exclude)
				continue;

			if (timer->is_soft)
				cpu_base->softirq_next_timer = timer;
			else
				cpu_base->next_timer = timer;
		}
	}
	/* The returned expiry must not be negative */
	if (expires_next < 0)
		expires_next = 0;
	return expires_next;
}

exclude参数表示是否要排除掉某个定时器，一般指定为NULL；
active参数是一个位图，表示要查找哪些hrtimer_clock_base结构体内的定时器；
参数expires_next表示到期时间，
只有在 __hrtimer_next_event_base 函数中找到的定时器到期时间小于 expires_next 参数时才会相应更改 hrtimer_cpu_base 结构体并返回。

还有，在查找到期事件的时候，调用的是hrtimer_get_expires函数：

/*
 * hrtimer_get_expires - read the expiry stored in the timer's queue node.
 *
 * This is the value the timer queue is keyed on (the "hard" expiry).
 */
static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
	ktime_t hard_expiry = timer->node.expires;

	return hard_expiry;
}

所以，取的实际上是定时器的“硬”到期时间，后面可以看到在处理到期定时器时，使用的是“软”到期时间，两者有时间差。

3.7 对定时事件设备重编程 hrtimer_reprogram

前面分析高分辨率定时器激活代码的时候提到过，hrtimer_start_range_ns 函数在将定时器成功插入了对应的红黑树后会调用 hrtimer_reprogram 函数对底层的定时事件设备进行重编程：

 /*
  * hrtimer_reprogram - reprogram the clock event device for a newly queued
  * timer if that timer is now the earliest event.
  *
  * @timer:     timer that has just been enqueued
  * @reprogram: for soft timers, whether the hardware may be reprogrammed
  *
  * Updates the cached next timer / next expiry of the relevant cpu base and
  * calls tick_program_event() only when every check below passes.
  */
 static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
    struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
    struct hrtimer_clock_base *base = timer->base;
    /* Normalise the expiry to monotonic time */
    ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);

    WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

    /* Clamp a negative monotonic expiry to 0 */
    if (expires < 0)
        expires = 0;

    /* Soft timers have their own bookkeeping */
    if (timer->is_soft) {
        struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;

        /* Nothing to do while the softirq is already activated */
        if (timer_cpu_base->softirq_activated)
            return;

        /* Not earlier than the next soft expiry of the timer's CPU: done */
        if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
            return;

        /* Record this timer as the next soft event on the timer's cpu base */
        timer_cpu_base->softirq_next_timer = timer;
        timer_cpu_base->softirq_expires_next = expires;

        /* Stop unless it also beats the next hard expiry and reprogramming is allowed */
        if (!ktime_before(expires, timer_cpu_base->expires_next) ||
            !reprogram)
            return;
    }

    /* The timer is queued on another CPU: its device is not ours to program */
    if (base->cpu_base != cpu_base)
        return;

    /* Currently inside hrtimer_interrupt on this CPU: leave it alone */
    if (cpu_base->in_hrtirq)
        return;

    /* Not earlier than the next expiry already recorded for this CPU */
    if (expires >= cpu_base->expires_next)
        return;

    /* Record this timer as the next event of the current CPU */
    cpu_base->next_timer = timer;
    cpu_base->expires_next = expires;

    /* Not in high-resolution mode yet, or a hang was detected: do not touch the device */
    if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
        return;

    /* Program the underlying clock event device */
    tick_program_event(expires, 1);
}

可以看到，在还没切换到高精度模式时，是不会对下面的定时事件设备重编程的。因为在低精度模式下，定时事件设备是由 Tick 层管理的，其会将定时事件设备设置为按照一个固定周期触发，如果这时候对其进行重编程就会乱掉。

如果要重新编程的定时器是一个软定时器，并且当前正在执行软中断处理程序，也是直接退出的。因为软中断处理程序在退出之后会再次检查所有的软定时器，并进行重编程，所以这里就不用再处理了。

除了前面提到的激活一个新的高精度定时器时有可能会对定时事件设备进行重编程外，还有另外两个场景也可能会触发重编程。一是，前面也提到了，当要删除一个高精度定时器，且这个定时器是对应CPU上马上就要到期的定时器时；二是，后面会提到，从低精度模式切换到高精度模式时。与前面一种情况不同的是，前者知道要插入的定时器，而后者不知道要根据哪个已有定时器来重编程，因此需要先选出来，对这种场景的处理是由函数hrtimer_force_reprogram实现的：

/*
 * hrtimer_force_reprogram - recompute the next event of a cpu base and
 * program the clock event device with it.
 *
 * @cpu_base:   per-CPU hrtimer base to re-evaluate
 * @skip_equal: if non-zero, return without reprogramming when the new
 *              expiry equals the cached cpu_base->expires_next
 */
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
	ktime_t expires_next;

	/* Earliest expiry across all timer kinds */
	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);

	/* The earliest timer turned out to be a soft timer */
	if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
		/* Softirq already activated: pick the earliest hard timer instead */
		if (cpu_base->softirq_activated)
			expires_next = __hrtimer_get_next_event(cpu_base,
								HRTIMER_ACTIVE_HARD);
		else
			/* Otherwise remember it as the next soft expiry */
			cpu_base->softirq_expires_next = expires_next;
	}

	/* Optionally skip the reprogramming when nothing changed */
	if (skip_equal && expires_next == cpu_base->expires_next)
		return;

	/* Cache the new next expiry */
	cpu_base->expires_next = expires_next;

	/* Not in high-resolution mode yet, or a hang was detected: do not touch the device */
	if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
		return;

	/* Program the underlying clock event device */
	tick_program_event(cpu_base->expires_next, 1);
}

hrtimer_force_reprogram 函数会在当前 CPU 上选出一个最合适的即将到期的定时器，然后用该定时器的到期时间对下层定时事件设备进行编程。在选择即将到期的定时器时，函数首先在所有类型的定时器中选一个最早到期的，但如果选出的定时器是软的并且现在正在执行软中断处理程序，则需要重新在所有硬定时器中选择一个最早到期的。

两种类型的场景下，最后都会调用 tick_program_event 函数，真正的对底层定时事件设备进行重编程：

/*
 * tick_program_event - program this CPU's clock event device.
 *
 * @expires: absolute expiry; KTIME_MAX stops the device instead
 * @force:   passed through to clockevents_program_event()
 *
 * Returns 0 when the device was stopped, otherwise the result of
 * clockevents_program_event().
 */
int tick_program_event(ktime_t expires, int force)
{
    struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);

    /* KTIME_MAX means "no event pending": stop the device */
    if (unlikely(expires == KTIME_MAX)) {
        /* Switch the device to CLOCK_EVT_STATE_ONESHOT_STOPPED */
        clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
        dev->next_event = KTIME_MAX;
        return 0;
    }

    /* Device was stopped earlier: bring it back before programming */
    if (unlikely(clockevent_state_oneshot_stopped(dev))) {
        /* Switch the device back to CLOCK_EVT_STATE_ONESHOT */
        clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
    }

    /* Program the new expiry into the device */
    return clockevents_program_event(dev, expires, force);
}

tick_program_event 函数返回0表示编程成功，其它值表示编程失败。force参数表示是否要对定时事件设备进行强制编程。clockevents_program_event 函数已经在定时事件层分析过了，如果force是1的话，表示当这次定时事件编程出了问题时，需要尝试用最小的时间间隔重新设定该设备。

3.8 周期处理（低精度模式）
在低精度模式下，高分辨率定时器层其实是靠普通的（低分辨率）定时器层驱动的。在分析（低分辨率）定时器层的时候曾经提到，当Tick到来时其处理函数会调用 hrtimer_run_queues 函数通知高分辨率定时器层：

/*
 * hrtimer_run_queues - per-tick hrtimer processing in low-resolution mode.
 *
 * Does nothing once high-resolution mode is active. Otherwise it first tries
 * to switch to high-resolution mode; if no switch happens it raises the
 * hrtimer softirq for expired soft timers and runs expired hard timers.
 */
void hrtimer_run_queues(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	unsigned long flags;
	ktime_t now;

	/* Already in high-resolution mode: nothing to do here */
	if (__hrtimer_hres_active(cpu_base))
		return;

	/* Check whether a switch to high-resolution mode is possible and perform it */
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
		hrtimer_switch_to_hres();
		return;
	}

	/* Take the cpu base lock and disable interrupts */
	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	/* Fetch the current time and refresh the clock base offsets */
	now = hrtimer_update_base(cpu_base);

	/* now >= softirq_expires_next: at least one soft timer has expired */
	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		/* Mark the softirq as activated */
		cpu_base->softirq_activated = 1;
		/* Raise HRTIMER_SOFTIRQ */
		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
	}

	/* Run every expired hard timer */
	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
	/* Release the lock and restore interrupts */
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}

每次调用 hrtimer_run_queues 函数的时候，也就是每次Tick到来的时候，都会判断是不是可以切换到高精度模式。如果确实可以切换，那就调用hrtimer_switch_to_hres完成切换并退出。关于切换成高精度模式的代码后面会分析。如果不需要切换，则函数接着调用hrtimer_update_base函数从时间维护层获得当前时间和所有类型时间与单调时间的偏移值，并对应设置到所有的hrtimer_clock_base结构体中的offset变量里：

/*
 * hrtimer_update_base - fetch the current time and refresh clock offsets.
 *
 * ktime_get_update_offsets_now() fills in the offsets of the REALTIME,
 * BOOTTIME and TAI clock bases; the same values are then copied to the
 * matching *_SOFT clock bases.
 *
 * Returns the time reported by ktime_get_update_offsets_now().
 */
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
	struct hrtimer_clock_base *bases = base->clock_base;
	ktime_t *real = &bases[HRTIMER_BASE_REALTIME].offset;
	ktime_t *boot = &bases[HRTIMER_BASE_BOOTTIME].offset;
	ktime_t *tai = &bases[HRTIMER_BASE_TAI].offset;
	ktime_t now;

	now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
					   real, boot, tai);

	/* Mirror the hard bases' offsets into their soft counterparts */
	bases[HRTIMER_BASE_REALTIME_SOFT].offset = *real;
	bases[HRTIMER_BASE_BOOTTIME_SOFT].offset = *boot;
	bases[HRTIMER_BASE_TAI_SOFT].offset = *tai;

	return now;
}

前面分析的时候可以看到，最终比较到期时间时都是先将不同类型的到期时间根据对应设置好的offset值，转换成单调时间后再统一比较的。

如果当前时间已经不早于（等于或迟于）softirq_expires_next 变量的值了，表明已经有“软”定时器到期了，这时候要激活软中断处理程序，并设置 softirq_activated 标志位为1。软中断处理程序会在适当的时候被执行，处理到期的“软”定时器：

 /*
  * hrtimer_run_softirq - HRTIMER_SOFTIRQ handler: run expired soft timers.
  *
  * @h: softirq action descriptor (not used in this function)
  */
 static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	unsigned long flags;
	ktime_t now;

	/* NOTE(review): expiry lock helper is not shown in this file; paired with the unlock below */
	hrtimer_cpu_base_lock_expiry(cpu_base);
	/* Take the cpu base lock and disable interrupts */
	raw_spin_lock_irqsave(&cpu_base->lock, flags);

	/* Fetch the current time and refresh the clock base offsets */
	now = hrtimer_update_base(cpu_base);
	/* Run every expired soft timer */
	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);

	/* Clear softirq_activated: soft timer processing is finished */
	cpu_base->softirq_activated = 0;
	/* Find the next soft timer and reprogram for it if necessary */
	hrtimer_update_softirq_timer(cpu_base, true);

	/* Release the lock and restore interrupts */
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	hrtimer_cpu_base_unlock_expiry(cpu_base);
}

后面可以看到，其实在高精度模式下，也是通过激活 HRTIMER_SOFTIRQ 软中断处理程序来处理“软”定时器的。在软中断处理程序里，首先还是需要调用 hrtimer_update_base 函数获得当前的时间和各种 offset，因为软中断不是立即执行的，执行的时候可能距离激活的时候过了很长时间，所以需要再次更新。更新过之后，调用 __hrtimer_run_queues 函数，并将最后一个参数设置为HRTIMER_ACTIVE_SOFT，表明需要处理“软”定时器：

/*
 * __hrtimer_run_queues - run all expired timers of the selected kind.
 *
 * @cpu_base:    per-CPU hrtimer base to process
 * @now:         current time (monotonic)
 * @flags:       saved interrupt flags, handed on to __run_hrtimer()
 * @active_mask: HRTIMER_ACTIVE_SOFT or HRTIMER_ACTIVE_HARD
 *
 * A timer is considered expired once the base-specific "now" has reached its
 * soft expiry; processing of a base stops at the first timer that has not.
 */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
				 unsigned long flags, unsigned int active_mask)
{
	struct hrtimer_clock_base *base;
	unsigned int active = cpu_base->active_bases & active_mask;

	/* Walk every clock base of the requested kind that holds timers */
	for_each_active_base(base, cpu_base, active) {
		struct timerqueue_node *node;
		ktime_t basenow;

		/* Convert monotonic "now" into this base's own clock */
		basenow = ktime_add(now, base->offset);

		/* Keep taking the earliest timer of this base */
		while ((node = timerqueue_getnext(&base->active))) {
			struct hrtimer *timer;

			/* Map the queue node back to its hrtimer */
			timer = container_of(node, struct hrtimer, node);

			/* Soft expiry of the earliest timer not reached yet: stop */
			if (basenow < hrtimer_get_softexpires_tv64(timer))
				break;

			/* Handle the expired timer */
			__run_hrtimer(cpu_base, base, timer, &basenow, flags);
			if (active_mask == HRTIMER_ACTIVE_SOFT)
				hrtimer_sync_wait_running(cpu_base, flags);
		}
	}
}

该函数的主要目的是处理所有指定类型下（“软”或“硬”）到期的定时器。函数会遍历所有指定类型并且包含有定时器的 hrtimer_clock_base 结构体，对每个结构体先获得其下面最早要到期的那个定时器，也就是获得红黑树的最左边的那个节点，判断定时器的软到期时间是否已经到达，如果到达了，就调用 __run_hrtimer 函数对这个定时器进行处理，并接着循环取下一个要到期的定时器。直到碰到第一个软到期时间还没到的定时器为止，退出循环。前面提到过，高分辨率定时器的到期时间可能是一个范围，这个范围前面的时间点是“软”到期时间，范围后面的时间点是“硬”到期时间。当前时间只要处在这个范围里面，都可以让该定时器到期，没有超时。但是，如果当前时间超过“硬”到期时间则表示该定时器超时了。内核在处理的时候使用了一个小技巧，在查找并设置定时事件设备的到期时间的时候用的是“硬”到期时间，但是在处理到期定时器的时候却使用的是“软”到期时间，这样可以保证尽量少的调用中断，在一次中断中能处理尽可能多的定时器，同时还能保证它们不超时。

这里我们假设有四个定时器：在这里插入图片描述
如果按照红黑树查找，那么定时器1是最左下的节点，因为它的“硬”到期时间最早，虽然定时器2的“软”到期时间在定时器1的“软”到期时间之前。因此，高分辨率定时器层会用定时器1的“硬”到期时间对定时事件设备进行编程。当事件到期后，会触发处理函数，这个时候可以同时让定时器1、2和3都到期，因为已经超过了它们三个的“软”到期时间。

函数__run_hrtimer用来真正处理一个到期定时器：

/*
 * __run_hrtimer - handle one expired timer.
 *
 * @cpu_base: per-CPU hrtimer base; its lock must be held on entry
 * @base:     clock base the timer is queued on
 * @timer:    the expired timer
 * @now:      current time in @base's clock, passed to the trace point
 * @flags:    saved interrupt flags used when dropping the lock
 *
 * The timer is dequeued, its callback is invoked with the lock dropped, and
 * it is re-queued if the callback asks for a restart and has not already
 * re-queued it.
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
			  struct hrtimer_clock_base *base,
			  struct hrtimer *timer, ktime_t *now,
			  unsigned long flags)
{
	enum hrtimer_restart (*fn)(struct hrtimer *);
	int restart;

	lockdep_assert_held(&cpu_base->lock);

	debug_deactivate(timer);
	/* Mark this timer as the one being handled (checked by switch_hrtimer_base) */
	base->running = timer;

	raw_write_seqcount_barrier(&base->seq);

	/* Dequeue the timer */
	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
	/* Fetch the timer's expiry callback */
	fn = timer->function;

	if (IS_ENABLED(CONFIG_TIME_LOW_RES))
		timer->is_rel = false;

	/* Drop the lock and restore interrupts while the callback runs */
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
	trace_hrtimer_expire_entry(timer, now);
	/* Invoke the expiry callback */
	restart = fn(timer);
	trace_hrtimer_expire_exit(timer);
	/* Re-take the lock with interrupts disabled */
	raw_spin_lock_irq(&cpu_base->lock);

	/* Callback requested a restart and did not enqueue the timer itself */
	if (restart != HRTIMER_NORESTART &&
	    !(timer->state & HRTIMER_STATE_ENQUEUED))
		/* Put the timer back into the queue */
		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);

	raw_write_seqcount_barrier(&base->seq);

	WARN_ON_ONCE(base->running != timer);
	/* Handling finished: clear the running marker */
	base->running = NULL;
}

该函数会设置对应 hrtimer_clock_base 结构体的 running 字段为当前要处理的定时器，设置它的目的是防止其在处理的过程中被迁移。在后面调用定时器的到期处理函数之前，会临时释放自旋锁并开中断，这样做可以提高性能，因为到期处理函数可能会耗费较长时间，当然在执行完成后还会再次获得自旋锁并关中断。在这段时间期间，如果真的出现了要对该定时器进行迁移的情况，前面分析中可以看到，在迁移函数switch_hrtimer_base中会对running字段进行判断，如果running指向的就是要迁移的定时器，那就放弃迁移。

到期处理函数的返回值有两种：

/* Return value of an hrtimer expiry callback. */
enum hrtimer_restart {
	HRTIMER_NORESTART = 0,	/* one-shot: the timer is finished */
	HRTIMER_RESTART = 1,	/* periodic: the timer must be queued again */
};

HRTIMER_NORESTART表示该定时器是一次性的，到期后就结束了。而HRTIMER_RESTART表示其是周期性的，执行完之后还要再次激活。

在软中断处理函数 hrtimer_run_softirq 执行完 __hrtimer_run_queues 后，需要清空softirq_activated标志位，表明不再执行软中断处理程序了。最后，前面也提到过，会调用hrtimer_update_softirq_timer，重新查找所有“软”定时器，找到即将要到期的定时器，并用那个到期时间调用hrtimer_reprogram函数，对定时事件设备进行编程：

/*
 * hrtimer_update_softirq_timer - re-evaluate the next soft timer.
 *
 * @cpu_base:  per-CPU hrtimer base to search
 * @reprogram: forwarded to hrtimer_reprogram()
 *
 * Looks up the earliest soft timer. If one exists, it is handed to
 * hrtimer_reprogram(); otherwise nothing is done.
 */
static void
hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
	ktime_t next_soft = __hrtimer_get_next_event(cpu_base,
						     HRTIMER_ACTIVE_SOFT);

	/* KTIME_MAX means no soft timer is pending */
	if (next_soft != KTIME_MAX)
		hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}

激活完软中断处理程序，hrtimer_run_queues 函数接着也是调用 __hrtimer_run_queues 来处理硬定时器的，只不过 active_mask 参数传递的是 HRTIMER_ACTIVE_HARD。

可以看到，在 hrtimer_run_queues 函数里面处理完所有“硬”定时器后，没有调用任何函数对底层的定时事件设备进行重编程。而在软中断处理程序中，处理完所有“软”定时器后，调用hrtimer_update_softirq_timer 函数时，是会调用 hrtimer_reprogram 函数的。但是，前面分析hrtimer_reprogram 函数的时候也提到，在这个函数内部会判断是否已经切换到高精度模式了，如果没有也是不会去对定时事件设备编程的。软中断处理程序在高精度模式和低精度模式是一样的，因此为了用一套代码，都会调用 hrtimer_reprogram 函数，但实际在低精度模式下不起作用。

3.9 低精度模式切换到高精度模式 hrtimer_switch_to_hres
在前面分析低精度模式下的周期处理函数hrtimer_run_queues的时候提到，每次都会调用如下代码检测并尝试切换到高精度模式：

…
if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
	hrtimer_switch_to_hres();
	return;
}
…

其中hrtimer_is_hres_enabled函数用来检测是否允许切换到高精度模式：

```c
#ifdef CONFIG_HIGH_RES_TIMERS
......
/* High-resolution support compiled in: report the hrtimer_hres_enabled switch. */
static inline int hrtimer_is_hres_enabled(void)
{
	return hrtimer_hres_enabled;
}
......
#else
......
/* High-resolution support compiled out: always report "disabled". */
static inline int hrtimer_is_hres_enabled(void) { return 0; }
......
#endif


```

如果编译的时候没有打开CONFIG_HIGH_RES_TIMERS编译选项，则一定返回否。如果打开了该编译选项，就直接返回全局变量hrtimer_hres_enabled的值。hrtimer_hres_enabled是一个全局布尔变量，默认设置成true，但是可以通过启动内核时传递参数highres为否进行关闭：

```c
 /* Switch for high-resolution mode; enabled unless overridden below. */
 static bool hrtimer_hres_enabled __read_mostly  = true;

/*
 * Parse the value of the "highres=" boot parameter into
 * hrtimer_hres_enabled. Returns 1 if kstrtobool() accepted the string,
 * 0 otherwise.
 */
static int __init setup_hrtimer_hres(char *str)
{
	return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}

/* Register the handler for the "highres=" kernel command line option. */
__setup("highres=", setup_hrtimer_hres);

/* Report whether switching to high-resolution mode is allowed. */
static inline int hrtimer_is_hres_enabled(void)
{
	return hrtimer_hres_enabled;
}

```

tick_check_oneshot_change 函数的一开始先判断 check_clocks 标志的第 0 位是否被置位，如果没有置位，说明系统中没有注册新的时钟设备，那保持现状就可以了，所以函数直接返回0。check_clocks 标志的第 0 位是由时间维护层或者Tick层来置位的。

在分析时间维护层切换时钟源的时候，提到在切换函数 timekeeping_notify 中，会调用tick_clock_notify 函数通知 Tick 模拟层：

 void tick_clock_notify(void)
{
	int cpu;
 
	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

该函数会置位系统中所有CPU的tick_sched结构体中的check_clocks变量的第0位。

在分析Tick层的Tick设备设置和切换的时候，提到在设置新设备的函数tick_check_new_device中，会调用tick_oneshot_notify函数通知Tick模拟层：

void tick_oneshot_notify(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 
	set_bit(0, &ts->check_clocks);
}

该函数只会置位代表本 CPU 的 tick_sched 结构体中的 check_clocks 变量的第 0 位。

如果 tick_sched 结构中的 nohz_mode 字段不是 NOHZ_MODE_INACTIVE，表明系统已经切换到其它模式，直接返回。nohz_mode的取值有3种（代码位于kernel/time/tick-sched.h中）：

/* Values of the nohz_mode field in struct tick_sched. */
enum tick_nohz_mode {
	NOHZ_MODE_INACTIVE = 0,	/* no mode switch has happened yet */
	NOHZ_MODE_LOWRES = 1,
	NOHZ_MODE_HIGHRES = 2,
};
通过调用时间维护层的函数 timekeeping_valid_for_hres 判断是否当前的时钟源设备是高分辨率设备（代码位于kernel/time/timekeeping.c中）：

/*
 * timekeeping_valid_for_hres - is the current clocksource usable for
 * high-resolution mode?
 *
 * Returns the CLOCK_SOURCE_VALID_FOR_HRES bit of the current monotonic
 * clocksource's flags (non-zero if set, 0 otherwise).
 */
int timekeeping_valid_for_hres(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	unsigned int seq;
	int ret;

	/* Sequence-counter read loop: repeat until a consistent snapshot is read */
	do {
		seq = read_seqcount_begin(&tk_core.seq);

		ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;

	} while (read_seqcount_retry(&tk_core.seq, seq));

	return ret;
}

同时调用Tick层的tick_is_oneshot_available函数判断当前的定时事件设备是否支持单次触发模式。

/*
 * tick_is_oneshot_available - can this CPU's clock event device be used in
 * one-shot mode?
 *
 * Returns 0 if there is no device or it lacks CLOCK_EVT_FEAT_ONESHOT, 1 if
 * it has one-shot support and does not carry CLOCK_EVT_FEAT_C3STOP, and
 * otherwise whatever tick_broadcast_oneshot_available() reports.
 */
int tick_is_oneshot_available(void)
{
	struct clock_event_device *evt = __this_cpu_read(tick_cpu_device.evtdev);

	/* No clock event device on this CPU */
	if (!evt)
		return 0;

	/* Device cannot do one-shot at all */
	if (!(evt->features & CLOCK_EVT_FEAT_ONESHOT))
		return 0;

	/* C3STOP devices depend on the tick broadcast layer */
	if (evt->features & CLOCK_EVT_FEAT_C3STOP)
		return tick_broadcast_oneshot_available();

	return 1;
}

tick_check_oneshot_change 函数的参数 allow_nohz 表示是否要切换到 NOHZ_MODE_LOWRES模式，如果传入的是true则需要切换，如果传入的是false则表示不切换，直接返回1。所以，如果关闭了高精度模式的话，则会通知Tick模拟层切换到NOHZ_MODE_LOWRES模式。

所以，要切换到高精度模式必须满足以下几个条件：

高精度模式没有被禁止；
没有曾经切换到其它模式下；
当前时钟源设备是高分辨率的；
当前定时事件设备是单次触发的。
满足以上所有条件后，tick_check_oneshot_change函数会返回1，会接着调用hrtimer_switch_to_hres函数真正切换到高精度模式：

 /*
  * hrtimer_switch_to_hres - switch the current CPU to high-resolution mode.
  *
  * On failure of tick_init_highres() a warning is printed and nothing else
  * is changed.
  */
 static void hrtimer_switch_to_hres(void)
{
	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

	/* Switch the tick device over to high-resolution operation */
	if (tick_init_highres()) {
		pr_warn("Could not switch to high resolution mode on CPU %u\n",
			base->cpu);
		return;
	}
	/* Mark this CPU's hrtimer base as running in high-resolution mode */
	base->hres_active = 1;
	hrtimer_resolution = HIGH_RES_NSEC;

	/* Set up the tick emulation layer */
	tick_setup_sched_timer();
	/* Program the clock event device for the next pending timer */
	retrigger_next_event(NULL);
}

该函数调用 tick_init_highres 函数，切换到高精度模式：

/*
 * tick_init_highres - switch the tick device to one-shot mode with
 * hrtimer_interrupt as its event handler.
 *
 * Returns the result of tick_switch_to_oneshot().
 */
int tick_init_highres(void)
{
	int err = tick_switch_to_oneshot(hrtimer_interrupt);

	return err;
}
该函数调用 tick_switch_to_oneshot 函数，将定时事件设备切换到单次触发模式，并将中断到期处理函数设置成 hrtimer_interrupt：

 /*
  * tick_switch_to_oneshot - put this CPU's tick device into one-shot mode.
  *
  * @handler: event handler to install on the clock event device
  *
  * Returns 0 on success, or -EINVAL (after logging the reason) if there is
  * no device, the device lacks CLOCK_EVT_FEAT_ONESHOT, or it is not
  * functional.
  */
 int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
	struct clock_event_device *dev = td->evtdev;

	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
		    !tick_device_is_functional(dev)) {

		/* The reason is appended to this message with pr_cont() below */
		pr_info("Clockevents: could not switch to one-shot mode:");
		if (!dev) {
			pr_cont(" no tick device\n");
		} else {
			if (!tick_device_is_functional(dev))
				pr_cont(" %s is not functional.\n", dev->name);
			else
				pr_cont(" %s does not support one-shot mode.\n",
					dev->name);
		}
		return -EINVAL;
	}

	/* Switch this CPU's tick device to TICKDEV_MODE_ONESHOT */
	td->mode = TICKDEV_MODE_ONESHOT;
	/* Install the new event handler */
	dev->event_handler = handler;
	/* Switch the clock event device to one-shot state */
	clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
	/* Tell the tick broadcast layer to switch to one-shot as well */
	tick_broadcast_switch_to_oneshot();
	return 0;
}

一旦切换到了高精度模式，那底层的定时事件设备就一定会被切换到单次触发模式，而且到期后中断处理程序不再会调用Tick层的 tick_handle_periodic，而是换成了高分辨率定时器层的hrtimer_interrupt 函数。一旦完成了切换也就意味着从此周期触发的 Tick 将不复存在了，如果此时不对底层的定时事件设备进行重编程，那么它就永远不会再次被触发。因此，在切换成功后，还必须要找到最近到期的定时器，并用它的到期时间对定时事件设备进行重编程：

/*
 * retrigger_next_event - reprogram this CPU's clock event device for the
 * earliest pending timer.
 *
 * @arg: not used in this function
 *
 * Does nothing unless the CPU is already in high-resolution mode.
 */
static void retrigger_next_event(void *arg)
{
	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

	/* Only meaningful once high-resolution mode is active */
	if (!__hrtimer_hres_active(base))
		return;

	/* Take the cpu base lock */
	raw_spin_lock(&base->lock);
	/* Fetch the current time and refresh the clock base offsets */
	hrtimer_update_base(base);
	/* Select the next timer and program the device for it */
	hrtimer_force_reprogram(base, 0);
	/* Release the cpu base lock */
	raw_spin_unlock(&base->lock);
}

3.10 周期处理（高精度模式，hrtimer_interrupt）
一旦切换到高精度模式后，定时事件设备到期后的中断处理程序会调用 hrtimer_interrupt 函数：

/*
 * hrtimer_interrupt - clock event handler used in high-resolution mode.
 *
 * @dev: the clock event device that fired
 *
 * Raises the hrtimer softirq for expired soft timers, runs expired hard
 * timers, then programs the device for the next event. If programming fails
 * the whole pass is retried (up to three passes in total); after that a hang
 * is recorded and the device is force-programmed with a delayed expiry.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	unsigned long flags;
	int retries = 0;

	BUG_ON(!cpu_base->hres_active);
	cpu_base->nr_events++;
	dev->next_event = KTIME_MAX;

	/* Take the cpu base lock and disable interrupts */
	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	/* Fetch the current time and refresh the clock base offsets */
	entry_time = now = hrtimer_update_base(cpu_base);
retry:
	/* Flag that the interrupt handler is running (checked by hrtimer_reprogram) */
	cpu_base->in_hrtirq = 1;
	cpu_base->expires_next = KTIME_MAX;

	/* now >= softirq_expires_next: at least one soft timer has expired */
	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		/* Mark the softirq as activated */
		cpu_base->softirq_activated = 1;
		/* Raise HRTIMER_SOFTIRQ */
		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
	}

	/* Run every expired hard timer */
	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

	/* Determine the next event across all timer kinds */
	expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
	cpu_base->expires_next = expires_next;
	cpu_base->in_hrtirq = 0;
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

	/* Programming succeeded (return value 0): done */
	if (!tick_program_event(expires_next, 0)) {
		cpu_base->hang_detected = 0;
		return;
	}

	/* Programming the clock event device failed */
	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	/* Refresh the current time and the clock base offsets */
	now = hrtimer_update_base(cpu_base);
	cpu_base->nr_retries++;
	/* Retry while fewer than three passes have been made */
	if (++retries < 3)
		goto retry;
	
	cpu_base->nr_hangs++;
	cpu_base->hang_detected = 1;
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

	/* Time spent inside this handler so far */
	delta = ktime_sub(now, entry_time);
	if ((unsigned int)delta > cpu_base->max_hang_time)
		cpu_base->max_hang_time = (unsigned int) delta;
	/* Push the next expiry back by the time spent, capped at 100 ms */
	if (delta > 100 * NSEC_PER_MSEC)
		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
	else
		expires_next = ktime_add(now, delta);
	/* Force-program the clock event device */
	tick_program_event(expires_next, 1);
	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}

高精度模式下中断处理程序也是通过直接调用__hrtimer_run_queues函数处理所有“硬”定时器的，并且也是通过激活HRTIMER_SOFTIRQ软中断来处理所有“软”定时器的。不同的是，在高精度模式下，底层的定时事件设备一定是工作在单次触发模式的，所以当到期后一定要对其进行重编程，否则下面所有的流程就中断了。但是，如果程序出错了，导致定时器超时了，那调用tick_program_event函数对定时事件设备编程就会失败，如果重试三次后都是失败，则表示出错了。这时，适当延迟到期时间后，再调用tick_program_event函数，并且是强制对定时事件设备进行编程。

3 使用实例
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>//jiffies在此头文件中定义
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/delay.h>
 
#include <linux/hrtimer.h>
 
/*
 * Timer period in nanoseconds (123 ms). The value is handed to
 * ns_to_ktime(), so the unit is nanoseconds, not microseconds.
 */
#define	HRTIMER0_NS	(123 * 1000 * 1000)

/* The single test timer; file-local because only this module uses it. */
static struct hrtimer g_hrtimer0;

/*
 * Expiry callback of the test timer.
 *
 * Logs the period, moves the expiry one period past the current time and
 * returns HRTIMER_RESTART so that the hrtimer core queues the timer again.
 */
static enum hrtimer_restart hrtimer_test_fn(struct hrtimer *hrtimer)
{
	/* The argument is an int expression, so it is printed with %d. */
	pr_err("#### hrtimer timeout: %d(ms)\n", HRTIMER0_NS / 1000 / 1000);

	hrtimer_forward_now(hrtimer, ns_to_ktime(HRTIMER0_NS));
	return HRTIMER_RESTART;
}

/*
 * Module init: set up the timer as a relative, hard-interrupt timer on
 * CLOCK_MONOTONIC and start it pinned to the current CPU.
 *
 * Always returns 0.
 */
static int __init hr_timer_test_init (void)
{
	struct hrtimer *hrtimer = &g_hrtimer0;

	pr_err("#### hr_timer_test module init...\n");

	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	hrtimer->function = hrtimer_test_fn;
	/*
	 * The start mode must carry the same HARD attribute the timer was
	 * initialised with; PINNED is added on top of it.
	 */
	hrtimer_start(hrtimer, ns_to_ktime(HRTIMER0_NS),
		      HRTIMER_MODE_REL_PINNED_HARD);

	return 0;
}

/*
 * Module exit: cancel the timer before the module text goes away.
 * hrtimer_cancel() is called on the self-restarting timer.
 */
static void __exit hr_timer_test_exit (void)
{
	struct hrtimer *hrtimer = &g_hrtimer0;

	hrtimer_cancel(hrtimer);
	pr_err("#### hr_timer_test module exit...\n");
}
 
module_init(hr_timer_test_init);
module_exit(hr_timer_test_exit);
 
MODULE_AUTHOR("hr_timer_test");
MODULE_LICENSE("GPL");