1.1 核心数据结构
在介绍具体的实现之前,先介绍一下相关的数据结构。
- struct hrtimer
/**
 * struct hrtimer - the basic hrtimer structure
 * @node:	timerqueue node, which also manages node.expires,
 *		the absolute expiry time in the hrtimers internal
 *		representation. The time is related to the clock on
 *		which the timer is based. Is setup by adding
 *		slack to the _softexpires value. For non range timers
 *		identical to _softexpires.
 * @_softexpires: the absolute earliest expiry time of the hrtimer.
 *		The time which was given as expiry time when the timer
 *		was armed.
 * @function:	timer expiry callback function
 * @base:	pointer to the timer base (per cpu and per clock)
 * @state:	state information (HRTIMER_STATE_* bit values)
 * @start_pid:	timer statistics field to store the pid of the task which
 *		started the timer
 * @start_site:	timer statistics field to store the site where the timer
 *		was started
 * @start_comm:	timer statistics field to store the name of the process
 *		which started the timer
 *
 * The start_* fields exist only when CONFIG_TIMER_STATS is enabled.
 *
 * The hrtimer structure must be initialized by hrtimer_init()
 */
struct hrtimer {
	struct timerqueue_node		node;
	ktime_t				_softexpires;
	enum hrtimer_restart		(*function)(struct hrtimer *);
	struct hrtimer_clock_base	*base;
	unsigned long			state;
#ifdef CONFIG_TIMER_STATS
	int				start_pid;
	void				*start_site;
	char				start_comm[16];
#endif
};
- struct hrtimer_clock_base
/**
 * struct hrtimer_clock_base - the timer base for a specific clock
 * @cpu_base:		per cpu clock base
 * @index:		clock type index for per_cpu support when moving a
 *			timer to a base on another cpu.
 * @clockid:		clock id for per_cpu support
 * @active:		red black tree root node for the active timers
 * @resolution:		the resolution of the clock, in nanoseconds
 * @get_time:		function to retrieve the current time of the clock
 * @softirq_time:	the time when running the hrtimer queue in the softirq
 * @offset:		offset of this clock to the monotonic base
 */
struct hrtimer_clock_base {
	struct hrtimer_cpu_base	*cpu_base;
	int			index;
	clockid_t		clockid;
	struct timerqueue_head	active;
	ktime_t			resolution;
	ktime_t			(*get_time)(void);
	ktime_t			softirq_time;
	ktime_t			offset;
};
- struct hrtimer_cpu_base
/* * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers * @active_bases: Bitfield to mark bases with active timers * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang * @nr_events: Total number of hrtimer interrupt events * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @clock_base: array of clock bases for this cpu */ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned long active_bases; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; int hang_detected; unsigned long nr_events; unsigned long nr_retries; unsigned long nr_hangs; ktime_t max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; }; |
使用DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases)定义hrtimer_bases,可以管理挂在每一个CPU上的所有hrtimer。每个CPU上的timer list不再使用timer wheel中多级链表的实现方式,而是采用了红黑树(Red-Black Tree)来进行管理。hrtimer_bases的定义如下所示:
/*
 * The per-CPU timer bases.
 *
 * Every CPU owns one hrtimer_cpu_base. Its clock_base[] array is
 * initialized here with one entry for CLOCK_REALTIME and one for
 * CLOCK_MONOTONIC; both start out with KTIME_LOW_RES resolution.
 *
 * .clockid is set explicitly for each entry: struct hrtimer_clock_base
 * carries a separate clockid field, and leaving it zero-initialized
 * would make both bases report the same clock id.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
	.clock_base =
	{
		{
			.index = CLOCK_REALTIME,
			.clockid = CLOCK_REALTIME,
			.get_time = &ktime_get_real,
			.resolution = KTIME_LOW_RES,
		},
		{
			.index = CLOCK_MONOTONIC,
			.clockid = CLOCK_MONOTONIC,
			.get_time = &ktime_get,
			.resolution = KTIME_LOW_RES,
		},
	}
};
图3展示了内核如何通过 hrtimer_bases 来管理 hrtimer。
图3
每个hrtimer_bases都包含两个clock_base,一个是CLOCK_REALTIME类型的,另一个是CLOCK_MONOTONIC类型的。hrtimer可以选择其中之一来设置timer的expire time,既可以是实际的时间,也可以是相对于系统运行的时间。在hrtimer_run_queues的处理中,首先要通过hrtimer_bases找到与正在处理当前中断的CPU相关联的clock_base,然后逐个检查每个clock_base上挂的timer是否超时。由于timer在添加到clock_base上时使用了红黑树,最早超时的timer被放到树的最左侧,因此寻找超时timer的过程非常迅速,找到的所有超时timer会被逐一处理。
1.2 低精度下的hrtimer
在低精度模式下,hrtimer 的核心处理函数是 hrtimer_run_queues,每一次 tick 中断都要执行一次(在tick的中断处理函数中调用update_process_times)。这个函数的调用流程为:
update_process_times
    run_local_timers
        hrtimer_run_queues
        raise_softirq(TIMER_SOFTIRQ)
其中hrtimer_run_queues是对到期的高精度定时器hrtimer的处理。
void hrtimer_run_queues(void)
{
struct timerqueue_node *node;
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
struct hrtimer_clock_base *base;
int index, gettime = 1;
//当hrtimer使能高精度时,该函数相当于空函数,不做任何处理直接返回。
if (hrtimer_hres_active())
return;
for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
base = &cpu_base->clock_base[index];
if (!timerqueue_getnext(&base->active))
continue;
if (gettime) {
hrtimer_get_softirq_time(cpu_base);
gettime = 0;
}
raw_spin_lock(&cpu_base->lock);
while ((node = timerqueue_getnext(&base->active))) {
struct hrtimer *timer;
timer = container_of(node, struct hrtimer, node);
if (base->softirq_time.tv64 <=
hrtimer_get_expires_tv64(timer))
break;
// 移除hrtimer并运行hrtimer的处理函数,更新hrtimer的状态
__run_hrtimer(timer, &base->softirq_time);
}
raw_spin_unlock(&cpu_base->lock);
}
}
可以看出:在未配置高精度模式时,hrtimer的到期由函数hrtimer_run_queues检查。hrtimer_run_queues是在run_local_timers中被调用,而run_local_timers又是在系统时钟中断中被调用。从这里可以看出,与传统的使用时间轮算法的定时器一样,hrtimer在未配置高精度模式时采用了在每一个系统时钟中断中轮询的方式来判断hrtimer是否到期,因此,这里的定时精度为时钟中断轮询的时间间隔。在函数hrtimer_run_queues的开始处,会执行一项检查:
if (hrtimer_hres_active())
return;
所以在配置高精度模式后,这里的hrtimer_run_queues函数相当于空函数,会直接返回。
1.3 高精度下的hrtimer
配置了高精度模式之后,hrtimer的到期由clock_event设备产生的硬中断来处理,该中断会调用hrtimer_interrupt函数。注意这里不再采用传统的轮询方式判断定时器是否到期,而是通过设置clock_event_device的中断,在第一个到期的定时器超时的时间点触发一个中断来执行超时操作。所以,这里的定时精度由clock_event_device的计时精度决定。
由于刚启动时没有特别重要的任务要做,因此默认是进入低精度+周期tick的工作模式,之后会根据硬件的配置(如硬件上是否支持高精度timer)和软件的配置(如是否通过命令行参数或者内核配置使能了高精度timer等特性)进行切换。切换过程的发起函数为run_timer_softirq,该函数被TIMER_SOFTIRQ软中断触发。其具体的流程为
run_timer_softirq
    hrtimer_run_pending
        tick_check_oneshot_change (在这里可能会切换到NOHZ模式,在后面进行分析)
        hrtimer_switch_to_hres
在update_process_times中,除了处理处于低精度模式的hrtimer外,还要唤醒 IRQ0的 softIRQ(TIMER_SOFTIRQ)以便执行timer wheel的代码。由于hrtimer子系统的加入,在IRQ0的softIRQ中,还需要通过hrtimer_run_pending检查是否可以将hrtimer切换到高精度模式:
hrtimer 进行精度切换的处理函数
/*
 * Check whether this CPU can leave low resolution mode.
 *
 * Does nothing once high resolution mode is active. Otherwise asks
 * tick_check_oneshot_change(), passing the negated result of
 * hrtimer_is_hres_enabled(), and switches to high resolution mode via
 * hrtimer_switch_to_hres() when that check returns non-zero.
 */
void hrtimer_run_pending(void)
{
	if (!hrtimer_hres_active() &&
	    tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
		hrtimer_switch_to_hres();
}
每一次触发IRQ0的softIRQ都需要检查一次是否可以将hrtimer切换到高精度,显然是十分低效的,希望将来有更好的方法不用每次都进行检查。
如果可以将hrtimer切换到高精度模式,则调用hrtimer_switch_to_hres函数进行切换。hrtimer 切换到高精度模式的核心函数
static int hrtimer_switch_to_hres(void)
{
int cpu = smp_processor_id();
struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
unsigned long flags;
if (base->hres_active)
return 1;
local_irq_save(flags);
if (tick_init_highres()) {
local_irq_restore(flags);
printk(KERN_WARNING "Could not switch to high resolution "
"mode on CPU %d\n", cpu);
return 0;
}
base->hres_active = 1;
base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
return 1;
}
在这个函数中,首先使用tick_init_highres更新与原来的tick device绑定的时钟事件设备的event handler,例如将在低精度模式下的工作函数tick_handle_periodic或者tick_handle_periodic_broadcast换成hrtimer_interrupt(它是hrtimer在高精度模式下的timer中断处理函数),同时将tick device的触发模式变为one-shot,即单次触发模式,这是使用dynamic tick或者hrtimer时tick device的工作模式。tick_init_highres通过调用tick_switch_to_oneshot函数来完成上述工作。
具体的代码如下:
/*
 * Switch the tick device of the current CPU to one-shot mode.
 *
 * @handler: event handler installed on the clock event device.
 *
 * Returns 0 on success. Returns -EINVAL, after logging the reason, if
 * ANY of the following holds: there is no clock event device, the
 * device lacks CLOCK_EVT_FEAT_ONESHOT, or tick_device_is_functional()
 * rejects it.
 */
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
	struct tick_device *td = &__get_cpu_var(tick_cpu_device);
	struct clock_event_device *dev = td->evtdev;

	if (dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
	    tick_device_is_functional(dev)) {
		td->mode = TICKDEV_MODE_ONESHOT;
		dev->event_handler = handler;
		clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
		tick_broadcast_switch_to_oneshot();
		return 0;
	}

	/* The device is unusable for one-shot operation: report why. */
	printk(KERN_INFO "Clockevents: "
	       "could not switch to one-shot mode:");
	if (!dev)
		printk(" no tick device\n");
	else if (!tick_device_is_functional(dev))
		printk(" %s is not functional.\n", dev->name);
	else
		printk(" %s does not support one-shot mode.\n",
		       dev->name);

	return -EINVAL;
}
由于dynamic tick可以随时停止和开始,以不规律的速度产生tick,因此支持one-shot模式的时钟事件设备是必须的;对于hrtimer,由于hrtimer采用事件机制驱动timer前进,因此使用one-shot的触发模式也是顺理成章的。不过这样一来,原本tick device每次执行中断时需要完成的周期性任务如更新jiffies / wall time (do_timer) 以及更新process的使用时间(update_process_times)等工作在切换到高精度模式之后就没有了,因此在执行完tick_init_highres之后紧接着会调用tick_setup_sched_timer函数来完成这部分设置工作。
下面我们就来看一下,中断处理函数hrtimer_interrupt
/*
 * High resolution timer interrupt handler.
 *
 * Runs the expired hrtimers on every active clock base of the current
 * CPU, then programs the clock event device for the earliest remaining
 * expiry. If programming fails, the scan is repeated (3 passes at most);
 * after that a hang is recorded and the next event is pushed into the
 * future.
 *
 * @dev: the clock event device; this function resets dev->next_event.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	int i, retries = 0;
	BUG_ON(!cpu_base->hres_active);
	cpu_base->nr_events++;
	dev->next_event.tv64 = KTIME_MAX;
	/* Remember when the interrupt handler was entered. */
	entry_time = now = ktime_get();
retry:
	expires_next.tv64 = KTIME_MAX;
	raw_spin_lock(&cpu_base->lock);
	/*
	 * We set expires_next to KTIME_MAX here with cpu_base->lock
	 * held to prevent that a timer is enqueued in our queue via
	 * the migration code. This does not affect enqueueing of
	 * timers which run their callback and need to be requeued on
	 * this CPU.
	 */
	cpu_base->expires_next.tv64 = KTIME_MAX;
	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		struct hrtimer_clock_base *base;
		struct timerqueue_node *node;
		ktime_t basenow;
		/*
		 * Bit i of active_bases tells whether clock base i of this
		 * CPU has active hrtimers; skip bases without any.
		 */
		if (!(cpu_base->active_bases & (1 << i)))
			continue;
		base = cpu_base->clock_base + i;
		/* Current time expressed in this base's clock. */
		basenow = ktime_add(now, base->offset);
		while ((node = timerqueue_getnext(&base->active))) {
			struct hrtimer *timer;
			timer = container_of(node, struct hrtimer, node);
			/*
			 * basenow is still before the soft expiry of the
			 * earliest timer, so nothing (more) has expired on
			 * this base. Record that timer's expiry, with the base
			 * offset removed again, as a candidate for the next
			 * event and stop scanning this base.
			 */
			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
				ktime_t expires;
				expires = ktime_sub(hrtimer_get_expires(timer),
						base->offset);
				if (expires.tv64 < expires_next.tv64)
					expires_next = expires;
				break;
			}
			__run_hrtimer(timer, &basenow);
		}
	}
	/*
	 * Store the new expiry value so the migration code can verify
	 * against it.
	 */
	cpu_base->expires_next = expires_next;
	raw_spin_unlock(&cpu_base->lock);
	/* Reprogramming necessary ? */
	if (expires_next.tv64 == KTIME_MAX ||
	    !tick_program_event(expires_next, 0)) {
		cpu_base->hang_detected = 0;
		return;
	}
	/*
	 * tick_program_event() returned non-zero, i.e. the next event could
	 * not be programmed (presumably because expires_next has already
	 * passed while the timers above were being handled). Refresh the
	 * time and rescan, giving up after the third pass.
	 */
	now = ktime_get();
	cpu_base->nr_retries++;
	if (++retries < 3)
		goto retry;
	/*
	 * Give the system a chance to do something else than looping
	 * here. We stored the entry time, so we know exactly how long
	 * we spent here. We schedule the next event this amount of
	 * time away.
	 */
	cpu_base->nr_hangs++;
	cpu_base->hang_detected = 1;
	delta = ktime_sub(now, entry_time);
	if (delta.tv64 > cpu_base->max_hang_time.tv64)
		cpu_base->max_hang_time = delta;
	/*
	 * Limit the enforced delay to a sensible value: the next event is
	 * scheduled delta after now, but never more than 100ms away.
	 */
	if (delta.tv64 > 100 * NSEC_PER_MSEC)
		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
	else
		expires_next = ktime_add(now, delta);
	/* Second argument is 1 here (0 above); presumably forces programming. */
	tick_program_event(expires_next, 1);
	printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
		    ktime_to_ns(delta));
}