Timer源代码分析
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
/*
* Have the 32 bit jiffies value wrap 5 minutes after boot
* so jiffies wrap bugs show up earlier.
*/
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
一般在嵌入式的平台上,HZ取值为100,所以300*HZ的值为30000。30000的十六进制表示为0x7530。数据都是以补码的形式存储的,正数的补码是它本身,负数的补码是取反再加1。0x7530取反后是0xffff8acf,再加1后是0xffff8ad0。由于经过了强制类型转换(unsigned long)(unsigned int),所以jiffies_64被赋值为0xffff8ad0
/*
* per-CPU timer vector definitions:
*/
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
struct tvec {
struct list_head vec[TVN_SIZE];
};
struct tvec_root {
struct list_head vec[TVR_SIZE];
};
struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
unsigned long timer_jiffies;
unsigned long next_timer;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
struct tvec tv4;
struct tvec tv5;
} ____cacheline_aligned;
struct tvec_base boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
这段代码是per-CPU定时器矢量的定义。
一般在内存比较少的平台才会定义CONFIG_BASE_SMALL为1,以便缩减内存的占用。在我们的平台上,CONFIG_BASE_SMALL定义为0。所以TVN_BITS为6,TVR_BITS为8,TVN_SIZE为26 =64,TVN_SIZE为28=256,TVN_MASK为0X3F,TVR_MASK为0XFF。
struct tvec 是一个有64个链表头的数组,struct tvec_base是一个有256个链表头的数组。
struct tvec_base结构体中:
lock是个自旋锁,操作struct tvec_base结构体中的变量值时都必须持有锁,以免引起同步等问题。
running_timer是本地cpu正在运行的定时器。
timer_jiffies 表示需要检查的动态定时器的最早到期时间。该字段在系统启动时被设置成jiffies的值,且只能由run_timer_softirq()函数增加它的值。一般情况下,它的值bijiffies的值大1。
next_timer是本地cpu最早到期的定时器。
tv1是包含256个list_head元素的数组,包含了在28-1个节拍内将要到期的所有动态定时器。
tv2是包含64个list_head元素的数组,包含了在214-1个节拍内将要到期的所有动态定时器。
tv3是包含64个list_head元素的数组,包含了在220-1个节拍内将要到期的所有动态定时器。
tv4是包含64个list_head元素的数组,包含了在226-1个节拍内将要到期的所有动态定时器。
tv5 与前面的字段几乎相同,但唯一的区别就是vec数组的最后一项是一个大expires字段值的动态定时器。
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
}
static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
{
return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
}
static inline void timer_set_deferrable(struct timer_list *timer)
{
timer->base = TBASE_MAKE_DEFERRED(timer->base);
}
static inline void
timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
{
timer->base = (struct tvec_base *)((unsigned long)(new_base) |
tbase_get_deferrable(timer->base));
}
/*
* Note that all tvec_bases are 2 byte aligned and lower bit of
* base in timer_list is guaranteed to be zero. Use the LSB to
* indicate whether the timer is deferrable.
*
* A deferrable timer will work normally when the system is busy, but
* will not cause a CPU to come out of idle just to service it; instead,
* the timer will be serviced when the CPU eventually wakes up with a
* subsequent non-deferrable timer.
*/
#define TBASE_DEFERRABLE_FLAG (0x1)
刚看这段代码时看得一头雾水,竟然绕个弯来判断定时器是否是可延迟的。它的原理是这样的:tvec_bases这个数据结构是2个字节对齐的,而timer_list结构体的base字段就是指向tvec_base的地址,所以base的值肯定是2的倍数,即最低位为0。搞不明白为什么要这样设计,增加代码复杂性,难道仅仅是为了省个保存标志位的内存?
#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *) \
((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG))
这个宏定义的作用是将原来的地址加1,即如果原来timer_list的base字段的地址为0x11223344,调用timer_set_deferrable(struct timer_list *timer)之后timer->base的值变为0x11223345。
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
{
int rem;
unsigned long original = j;
/*
* We don't want all cpus firing their timers at once hitting the
* same lock or cachelines, so we skew each extra cpu with an extra
* 3 jiffies. This 3 jiffies came originally from the mm/ code which
* already did this.
* The skew is done by adding 3*cpunr, then round, then subtract this
* extra offset again.
*/
j += cpu * 3;
rem = j % HZ;
/*
* If the target jiffie is just after a whole second (which can happen
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
* But never round down if @force_up is set.
*/
if (rem < HZ/4 && !force_up) /* round down */
j = j - rem;
else /* round up */
j = j - rem + HZ;
/* now that we have rounded, subtract the extra skew again */
j -= cpu * 3;
if (j <= jiffies) /* rounding ate our timeout entirely; */
return original;
return j;
}
/**
* __round_jiffies - function to round jiffies to a full second
* @j: the time in (absolute) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* __round_jiffies() rounds an absolute time in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time, which could lead
* to lock contention or spurious cache line bouncing.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long __round_jiffies(unsigned long j, int cpu)
{
return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);
/**
* __round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* __round_jiffies_relative() rounds a time delta in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time, which could lead
* to lock contention or spurious cache line bouncing.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
unsigned long j0 = jiffies;
/* Use j0 because jiffies might change while we run */
return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
/**
* round_jiffies - function to round jiffies to a full second
* @j: the time in (absolute) jiffies that should be rounded
*
* round_jiffies() rounds an absolute time in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long round_jiffies(unsigned long j)
{
return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
/**
* round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
*
* round_jiffies_relative() rounds a time delta in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long round_jiffies_relative(unsigned long j)
{
return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
/**
* __round_jiffies_up - function to round jiffies up to a full second
* @j: the time in (absolute) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* This is the same as __round_jiffies() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);
/**
* __round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* This is the same as __round_jiffies_relative() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
unsigned long j0 = jiffies;
/* Use j0 because jiffies might change while we run */
return round_jiffies_common(j + j0, cpu, true) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
/**
* round_jiffies_up - function to round jiffies up to a full second
* @j: the time in (absolute) jiffies that should be rounded
*
* This is the same as round_jiffies() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long round_jiffies_up(unsigned long j)
{
return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);
/**
* round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
*
* This is the same as round_jiffies_relative() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long round_jiffies_up_relative(unsigned long j)
{
return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
这部分代码是处理jiffies的向上取整和向下取整问题,作用是为了防止不同cpu上的定时器在同一时间触发,从而导致一些问题。其核心函数是jiffies_common(unsigned long j, int cpu, bool force_up)。它先将j加上cpu*3的值,然后对100取模。如果小于25,就向下取整,如果大于,则向上取整。最后再减去cpu*3的值。刚开始看代码时,我以为最开始的加cpu*3是没必要的,因为取整后再减去cpu*3就可以让每个定时器的到期时间不一样。但后来发现,在cpu数很多的时候,先加cpu*3可以让最后得出的值围绕原来的值上下波动,没加的话就一直对原来的值递减。
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
unsigned long idx = expires - base->timer_jiffies;
struct list_head *vec;
if (idx < TVR_SIZE) {
int i = expires & TVR_MASK;
vec = base->tv1.vec + i;
} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
int i = (expires >> TVR_BITS) & TVN_MASK;
vec = base->tv2.vec + i;
} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
vec = base->tv3.vec + i;
} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
vec = base->tv4.vec + i;
} else if ((signed long) idx < 0) {
/*
* Can happen if you add a timer with expires == jiffies,
* or you set a timer to go off in the past
*/
vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
} else {
int i;
/* If the timeout is larger than 0xffffffff on 64-bit
* architectures then we use the maximum timeout:
*/
if (idx > 0xffffffffUL) {
idx = 0xffffffffUL;
expires = idx + base->timer_jiffies;
}
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
}
/*
* Timers are FIFO:
*/
list_add_tail(&timer->entry, vec);
}
internal_add_timer是个比较重要的函数,它将timer加入到tvec_base的链表数组中。
1 将定时器的到期时间与base->timer_jiffies比较,得到一个索引值。根据这个值判断这个定时器属于哪个链表数组。
2 知道定时器属于哪个数组后,还要知道这个定时器属于数值中的哪个链表中。这是由expires中相应位段的值决定的。每个expires都被分成5个位段。0-7,8-13,14-19,20-25,26-31。
static void __init_timer(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
timer->entry.next = NULL;
timer->base = __raw_get_cpu_var(tvec_bases);
timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
timer->start_pid = -1;
memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
const char *name,
struct lock_class_key *key,
void (*function)(unsigned long),
unsigned long data)
{
timer->function = function;
timer->data = data;
init_timer_on_stack_key(timer, name, key);
timer_set_deferrable(timer);
}
EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
* @name: name of the timer
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
* init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
void init_timer_key(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
debug_init(timer);
__init_timer(timer, name, key);
}
EXPORT_SYMBOL(init_timer_key);
void init_timer_deferrable_key(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
init_timer_key(timer, name, key);
timer_set_deferrable(timer);
}
EXPORT_SYMBOL(init_timer_deferrable_key);
定时器的初始化比较简单,只是设置下timer_list各成员变量的初始值。注意要将每个定时器的base设置成本地cpu的tvec_bases,因为每个cpu都有属于自己的定时器。
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires,
bool pending_only, int pinned)
{
struct tvec_base *base, *new_base;
unsigned long flags;
int ret = 0 , cpu;
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
base = lock_timer_base(timer, &flags);
if (timer_pending(timer)) {
detach_timer(timer, 0);
if (timer->expires == base->next_timer &&
!tbase_get_deferrable(timer->base))
base->next_timer = base->timer_jiffies;
ret = 1;
} else {
if (pending_only)
goto out_unlock;
}
debug_activate(timer, expires);
cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
cpu = get_nohz_timer_target();
#endif
new_base = per_cpu(tvec_bases, cpu);
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
* However we can't change timer's base while it is running,
* otherwise del_timer_sync() can't detect that the timer's
* handler yet has not finished. This also guarantees that
* the timer is serialized wrt itself.
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
timer_set_base(timer, NULL);
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
timer_set_base(timer, base);
}
}
timer->expires = expires;
if (time_before(timer->expires, base->next_timer) &&
!tbase_get_deferrable(timer->base))
base->next_timer = timer->expires;
internal_add_timer(base, timer);
out_unlock:
spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
这个函数的作用是修改一个定时器:
1 先得到这个定时器所属的base,如果这个定时器已经加入了base的某个链表,则在链表中将这个定时器删除。如果这个base->next_timer刚好等于删掉的这个timer,就设置base->next_timer为base->timer_jiffies。
2 得到本地cpu的new_base值,如果与timer原来的base值不相等,说明不是同一个cpu,需要重新设置timer的base字段为new_base。
3 设置timer的expires字段为新的时间到期值。如果这个值比base->next_timer还小,则设置base->next_timer = timer->expires。
4 调用internal_add_timer函数将timer加入到链表中。
static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
/* cascade all the timers from tv up one level */
struct timer_list *timer, *tmp;
struct list_head tv_list;
list_replace_init(tv->vec + index, &tv_list);
/*
* We are removing _all_ timers from the list, so we
* don't have to detach them individually.
*/
list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
BUG_ON(tbase_get_base(timer->base) != base);
internal_add_timer(base, timer);
}
return index;
}
static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
unsigned long data)
{
int preempt_count = preempt_count();
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the timer from inside the
* function that is called from it, this we need to take into
* account for lockdep too. To avoid bogus "held lock freed"
* warnings as well as problems when looking into
* timer->lockdep_map, make a copy and use that here.
*/
struct lockdep_map lockdep_map = timer->lockdep_map;
#endif
/*
* Couple the lock chain with the lock chain at
* del_timer_sync() by acquiring the lock_map around the fn()
* call here and in del_timer_sync().
*/
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer);
fn(data);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
if (preempt_count != preempt_count()) {
WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
fn, preempt_count, preempt_count());
/*
* Restore the preempt count. That gives us a decent
* chance to survive and extract information. If the
* callback kept a lock held, bad luck, but not worse
* than the BUG() we had.
*/
preempt_count() = preempt_count;
}
}
#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
/**
* __run_timers - run all expired timers (if any) on this CPU.
* @base: the timer vector to be processed.
*
* This function cascades all vectors and executes all expired timer
* vectors.
*/
static inline void __run_timers(struct tvec_base *base)
{
struct timer_list *timer;
spin_lock_irq(&base->lock);
while (time_after_eq(jiffies, base->timer_jiffies)) {
struct list_head work_list;
struct list_head *head = &work_list;
int index = base->timer_jiffies & TVR_MASK;
/*
* Cascade timers:
*/
if (!index &&
(!cascade(base, &base->tv2, INDEX(0))) &&
(!cascade(base, &base->tv3, INDEX(1))) &&
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
list_replace_init(base->tv1.vec + index, &work_list);
while (!list_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
timer = list_first_entry(head, struct timer_list,entry);
fn = timer->function;
data = timer->data;
timer_stats_account_timer(timer);
base->running_timer = timer;
detach_timer(timer, 1);
spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, data);
spin_lock_irq(&base->lock);
}
}
base->running_timer = NULL;
spin_unlock_irq(&base->lock);
}
这几个函数主要处理定时器到期时的处理。分析下__run_timers函数:
1 如果base->timer_jiffies小于jiffies,则先从base->timer_jiffies的低位段开始计算,如果等于0,表示低位段的定时器都运行完了,需要将高位段的定时器搬移到低位段。搬移完后将base->timer_jiffies的值加1。
2 将base->tv1.vec + index链表的定时器摘下来。然后遍历这些链表,将每个定时器的超时函数执行一次。