Analysis of a "scheduling while atomic" exception
Disclaimer up front: these are personal study notes that have not been through repeated review. If you spot typos or mistakes, please point them out and I will fix them promptly.
The log at the scene of the crash:
BUG: scheduling while atomic: swapper/6/0/0x00010002
<4>[34987.79331428] C6 CPU: 6 PID: 0 Comm: swapper/6 Tainted: G
<4>[34987.79338774] C6 Call trace:
<4>[34987.79340851] C6 dump_backtrace+0xe8/0x108
<4>[34987.79349697] C6 show_stack+0x18/0x28
<4>[34987.79351928] C6 dump_stack_lvl+0x50/0x6c
<4>[34987.79358159] C6 dump_stack+0x18/0x24
<4>[34987.79361736] C6 __schedule_bug+0x5c/0x88
<4>[34987.79365582] C6 __schedule+0x698/0x9d4
<4>[34987.79369043] C6 schedule+0x7c/0xe8
<4>[34987.79371774] C6 schedule_preempt_disabled+0x24/0x40
<4>[34987.79374851] C6 __mutex_lock+0x3ec/0xf04
<4>[34987.79378082] C6 __mutex_lock_slowpath+0x14/0x24
<4>[34987.79381312] C6 mutex_lock+0x30/0xd8
Key error line: scheduling while atomic, with preempt_count = 0x10002.
preempt_count is what marks the atomic context here.
This is a Linux kernel bug report: a task switch was attempted in an atomic context, where schedule() must not be called.
In practice it means that something that can schedule (including taking a sleeping lock) was used in interrupt context (hardirq or softirq) or inside a spinlock critical section.
This article explores:
1. Why "scheduling while atomic" is reported
2. How preempt_count is added to and subtracted from, and what each bit field means
3. Why mutex and rwsem are allowed to schedule
4. How spinlock, hardirq and softirq update preempt_count
5. How to debug this class of problem
1. Why "scheduling while atomic" is reported
__schedule() reports the bug when in_atomic_preempt_off() is true, i.e. when preempt_count() (read from current_thread_info()->preempt.count on arm64) is not exactly PREEMPT_DISABLE_OFFSET:
【/kernel/kernel/sched/core.c, __schedule()】
6117         if (unlikely(in_atomic_preempt_off())) {
6118                 __schedule_bug(prev);
6119                 preempt_count_set(PREEMPT_DISABLED);
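The check compares against PREEMPT_DISABLE_OFFSET rather than 0 because __schedule() always runs with preemption disabled once, so a legal call sees a count of exactly 1. A sketch of the relevant definitions from include/linux/preempt.h (names as in recent kernels; the exact form may differ between versions):

/* The preempt_count offset after preempt_disable() */
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET
#else
# define PREEMPT_DISABLE_OFFSET 0
#endif

/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)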
The function that prints the "scheduling while atomic" report is __schedule_bug():
1. 【/kernel/kernel/sched/core.c】
2.
3. /*
4. * Print scheduling while atomic bug:
5. */
6. static noinline void __schedule_bug(struct task_struct *prev)
7. {
8. if (oops_in_progress)
9. return;
10.
11. printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
12. prev->comm, prev->pid, preempt_count());
13.
14. debug_show_held_locks(prev);
15. print_modules();
16. if (irqs_disabled())
17. print_irqtrace_events(prev);
18. #ifdef CONFIG_DEBUG_PREEMPT
19. if (in_atomic_preempt_off()) {
20. pr_err("Preemption disabled at:");
21. print_ip_sym(current->preempt_disable_ip);
22. pr_cont("\n");
23. }
24. #endif
25. dump_stack();
26. add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
27. }
Line 14: debug_show_held_locks() involves lockdep, a debugging facility that tracks lock acquisition.
Line 15: print_modules() prints the list of loaded modules ("Modules linked in: ...").
Line 17: if interrupts are disabled, print_irqtrace_events() prints irq-trace information, provided CONFIG_TRACE_IRQFLAGS is enabled.
Lines 18~24: with CONFIG_DEBUG_PREEMPT enabled, if we are in atomic context with preemption off, the IP of the call that last disabled preemption is printed.
Line 25: dump_stack() prints the current task's kernel-mode call stack.
2. How is preempt_count added and subtracted, and what does each bit field mean?
preempt_count: definition, add and sub
The preempt_count seen in the log is a member of the architecture-specific struct thread_info; the field exists to support kernel preemption (CONFIG_PREEMPT).
The definition below is from the 32-bit arm header; the arm64 layout is similar, except that the count is accessed as thread_info->preempt.count (as in the __preempt_count_add() listing further down):
1. 【/arch/arm/include/asm/thread_info.h】
2.
3.
4. struct thread_info {
5. unsigned long flags; /* low level flags */
6. int preempt_count; /* 0 => preemptable, <0 => bug */
7. mm_segment_t addr_limit; /* address limit */
8. struct task_struct *task; /* main task structure */
11. };
Line 6 defines preempt_count: a value of 0 means the task can be preempted, a negative value indicates a bug, and a value greater than 0 means preemption is disabled.
The 32 bits of preempt_count are divided into the following fields: preemption count, softirq count, hardirq count, NMI count, and the top bit PREEMPT_NEED_RESCHED (PNR; older kernels kept a PREEMPT_ACTIVE flag in this high region).
The bit layout of preempt_count:
bit 31       : PREEMPT_NEED_RESCHED (PNR, 1 bit)
bits 20 - 23 : NMI count      (4 bits)
bits 16 - 19 : HARDIRQ count  (4 bits)
bits  8 - 15 : SOFTIRQ count  (8 bits)
bits  0 -  7 : PREEMPT count  (8 bits)
The macros and basic operations related to preempt_count are as follows:
203 #define __preempt_count_inc() __preempt_count_add(1)
204 #define __preempt_count_dec() __preempt_count_sub(1)
205
206 #define preempt_count_inc() preempt_count_add(1)
207 #define preempt_count_dec() preempt_count_sub(1)
208
209 #ifdef CONFIG_PREEMPT_COUNT
210
211 #define preempt_disable() \
212 do { \
213 preempt_count_inc(); \
214 barrier(); \
215 } while (0)
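For symmetry, the matching preempt_enable() from the same header decrements the count and, when it drops to zero with a reschedule pending, calls back into the scheduler. A sketch of the CONFIG_PREEMPTION variant (the exact form varies across kernel versions):

#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) \
		__preempt_schedule(); \
} while (0)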
1. 【/kernel/include/linux/preempt.h】
2. ===========================================
3.
4. static __always_inline int preempt_count(void)
5. {
6. return current_thread_info()->preempt_count;
7. }
8.
9. /*
10. * The various preempt_count add/sub methods
11. */
12.
【/arch/arm64/include/asm/preempt.h】
44 static inline void __preempt_count_add(int val)
45 {
46 u32 pc = READ_ONCE(current_thread_info()->preempt.count);
47 pc += val;
48 WRITE_ONCE(current_thread_info()->preempt.count, pc);
49 }
18. static __always_inline void __preempt_count_sub(int val)
19. {
20. *preempt_count_ptr() -= val;
21. }
22.
23. static __always_inline bool __preempt_count_dec_and_test(void)
24. {
25. return !--*preempt_count_ptr() && tif_need_resched();
26. }
Line 4: preempt_count() returns the current task's thread_info preempt count.
Lines 44~48: __preempt_count_add() adds val to the count (this is the arm64 variant, which reads and writes thread_info->preempt.count).
Line 18: __preempt_count_sub() subtracts val from the count.
Line 23: __preempt_count_dec_and_test() first decrements the count by one, then checks whether it reached zero and the current task has TIF_NEED_RESCHED set.
25  *
26  * PREEMPT_MASK:         0x000000ff
27  * SOFTIRQ_MASK:         0x0000ff00
28  * HARDIRQ_MASK:         0x000f0000
29  * NMI_MASK:             0x00f00000
30  * PREEMPT_NEED_RESCHED: 0x80000000
31  */
32 #define PREEMPT_BITS   8
33 #define SOFTIRQ_BITS   8
34 #define HARDIRQ_BITS   4
35 #define NMI_BITS       4
49 #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)  /* 1 << 0  = 0x1      */
50 #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)  /* 1 << 8  = 0x100    */
51 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)  /* 1 << 16 = 0x10000  */
52 #define NMI_OFFSET     (1UL << NMI_SHIFT)      /* 1 << 20 = 0x100000 */
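With these masks, the value from the log, preempt_count = 0x00010002, decodes to a HARDIRQ count of 1 and a PREEMPT count of 2: the task was inside a hard interrupt with preemption disabled. A small user-space sketch (masks copied from the header above) that performs the decoding:

#include <stdio.h>
#include <stdint.h>

#define PREEMPT_MASK  0x000000ff
#define SOFTIRQ_MASK  0x0000ff00
#define HARDIRQ_MASK  0x000f0000
#define NMI_MASK      0x00f00000

int main(void)
{
	uint32_t pc = 0x00010002; /* value from "swapper/6/0/0x00010002" */

	printf("preempt: %u\n", (unsigned)(pc & PREEMPT_MASK));         /* 2 -> preemption disabled twice */
	printf("softirq: %u\n", (unsigned)((pc & SOFTIRQ_MASK) >> 8));  /* 0 */
	printf("hardirq: %u\n", (unsigned)((pc & HARDIRQ_MASK) >> 16)); /* 1 -> inside a hard interrupt */
	printf("nmi:     %u\n", (unsigned)((pc & NMI_MASK) >> 20));     /* 0 */
	return 0;
}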
The following helper macros use these masks to tell which context we are currently in:
140 #define in_irq() (hardirq_count())
141 #define in_softirq() (softirq_count())
142 #define in_interrupt() (irq_count())
107 #define nmi_count() (preempt_count() & NMI_MASK)
108 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
109 #ifdef CONFIG_PREEMPT_RT
110 # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
111 # define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
112 #else
113 # define softirq_count() (preempt_count() & SOFTIRQ_MASK)
114 # define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
115 #endif
#define in_hardirq() (hardirq_count())
3. Why can mutex and rwsem schedule?
How mutex_lock manipulates preempt_count
In the listing below, preempt_disable() at line 22 adds 1 to preempt_count, and schedule_preempt_disabled() at line 84 subtracts that 1 again before actually scheduling.
The call path is __mutex_lock_slowpath() -> __mutex_lock_common():
3. /*
4. * Lock a mutex (possibly interruptible), slowpath:
5. */
6. static __always_inline int __sched
7. __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
8. struct lockdep_map *nest_lock, unsigned long ip,
9. struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
10. {
11. struct task_struct *task = current;
12. struct mutex_waiter waiter;
13. unsigned long flags;
14. int ret;
15.
16. if (use_ww_ctx) {
17. struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
18. if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
19. return -EALREADY;
20. }
21.
22. preempt_disable();
23. mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
24.
25. if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
26. /* got the lock, yay! */
27. preempt_enable();
28. return 0;
29. }
30.
31. spin_lock_mutex(&lock->wait_lock, flags);
32.
33. /*
34. * Once more, try to acquire the lock. Only try-lock the mutex if
35. * it is unlocked to reduce unnecessary xchg() operations.
36. */
37. if (!mutex_is_locked(lock) &&
38. (atomic_xchg_acquire(&lock->count, 0) == 1))
39. goto skip_wait;
40.
41. debug_mutex_lock_common(lock, &waiter);
42. debug_mutex_add_waiter(lock, &waiter, task);
43.
44. /* add waiting tasks to the end of the waitqueue (FIFO): */
45. list_add_tail(&waiter.list, &lock->wait_list);
46. waiter.task = task;
47.
48. lock_contended(&lock->dep_map, ip);
49.
50. for (;;) {
51. /*
52. * Lets try to take the lock again - this is needed even if
53. * we get here for the first time (shortly after failing to
54. * acquire the lock), to make sure that we get a wakeup once
55. * it's unlocked. Later on, if we sleep, this is the
56. * operation that gives us the lock. We xchg it to -1, so
57. * that when we release the lock, we properly wake up the
58. * other waiters. We only attempt the xchg if the count is
59. * non-negative in order to avoid unnecessary xchg operations:
60. */
61. if (atomic_read(&lock->count) >= 0 &&
62. (atomic_xchg_acquire(&lock->count, -1) == 1))
63. break;
64.
65. /*
66. * got a signal? (This code gets eliminated in the
67. * TASK_UNINTERRUPTIBLE case.)
68. */
69. if (unlikely(signal_pending_state(state, task))) {
70. ret = -EINTR;
71. goto err;
72. }
73.
74. if (use_ww_ctx && ww_ctx->acquired > 0) {
75. ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
76. if (ret)
77. goto err;
78. }
79.
80. __set_task_state(task, state);
81.
82. /* didn't get the lock, go to sleep: */
83. spin_unlock_mutex(&lock->wait_lock, flags);
84. schedule_preempt_disabled();
#        preempt_enable_no_resched();  /* re-enable preemption (count -1) but do not reschedule immediately;
#                                         this is why the mutex path is not in atomic context when it sleeps */
#        schedule();                   /* invoke the scheduler and pick the next task to run */
#        preempt_disable();            /* disable preemption again, restoring the caller's preemption state */
#
85. spin_lock_mutex(&lock->wait_lock, flags);
86. }
87. __set_task_state(task, TASK_RUNNING);
88.
89. mutex_remove_waiter(lock, &waiter, task);
90. /* set it to 0 if there are no waiters left: */
91. if (likely(list_empty(&lock->wait_list)))
92. atomic_set(&lock->count, 0);
93. debug_mutex_free_waiter(&waiter);
94.
95. skip_wait:
96. /* got the lock - cleanup and rejoice! */
97. lock_acquired(&lock->dep_map, ip);
98. mutex_set_owner(lock);
99.
100. if (use_ww_ctx) {
101. struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
102. ww_mutex_set_context_slowpath(ww, ww_ctx);
103. }
104.
105. spin_unlock_mutex(&lock->wait_lock, flags);
106. preempt_enable();
107. return 0;
108.
109. err:
110. mutex_remove_waiter(lock, &waiter, task);
111. spin_unlock_mutex(&lock->wait_lock, flags);
112. debug_mutex_free_waiter(&waiter);
113. mutex_release(&lock->dep_map, 1, ip);
114. preempt_enable();
115. return ret;
116. }
This function is long; we only focus on the preempt-related code and ignore the rest.
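Putting the pieces together, here is a rough trace of preempt_count across the slowpath (a sketch assuming the function is entered with a count of 0 and that spin_lock_mutex() is the plain spin_lock() variant):

/*
 * __mutex_lock_common()
 *   preempt_disable();                          // 0 -> 1
 *   spin_lock_mutex(&lock->wait_lock, flags);   // 1 -> 2
 *   ...
 *   spin_unlock_mutex(&lock->wait_lock, flags); // 2 -> 1, wait_lock dropped before sleeping
 *   schedule_preempt_disabled();
 *     sched_preempt_enable_no_resched();        // 1 -> 0
 *     schedule();                               // legal: preempt_count() == 0 here
 *     preempt_disable();                        // 0 -> 1 again after wake-up
 *   spin_lock_mutex(&lock->wait_lock, flags);   // 1 -> 2, loop continues
 *   ...
 *   preempt_enable();                           // back to 0 once the lock is acquired
 */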
How rwsem's sleeping paths handle preempt_count
rwsem_down_read_slowpath() sleeps through schedule_preempt_disabled(), which subtracts 1 from preempt_count before calling schedule(). The write slowpath below calls schedule() directly, and only disables preemption briefly around its spin-on-owner phase (the two arrows):
static struct rw_semaphore __sched *
1104 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1105 {
1106 long count;
1107 struct rwsem_waiter waiter;
1108 int null_owner_retries;
1109 DEFINE_WAKE_Q(wake_q);
1110 bool already_on_list = false;
1111
1112 /* do optimistic spinning and steal lock if possible */
1113 if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
1114 /* rwsem_optimistic_spin() implies ACQUIRE on success */
1115 trace_android_vh_record_rwsem_lock_starttime(current, jiffies);
1116 return sem;
1117 }
1118
1119 /*
1120 * Optimistic spinning failed, proceed to the slowpath
1121 * and block until we can acquire the sem.
1122 */
1123 waiter.task = current;
1124 waiter.type = RWSEM_WAITING_FOR_WRITE;
1125 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1126 waiter.handoff_set = false;
1127
1128 raw_spin_lock_irq(&sem->wait_lock);
1129
1130 trace_android_vh_alter_rwsem_list_add(
1131 &waiter,
1132 sem, &already_on_list);
1133 if (!already_on_list)
1134 rwsem_add_waiter(sem, &waiter);
1135
1136 /* we're now waiting on the lock */
1137 if (rwsem_first_waiter(sem) != &waiter) {
1138 count = atomic_long_read(&sem->count);
1139
1140 /*
1141 * If there were already threads queued before us and:
1142 * 1) there are no active locks, wake the front
1143 * queued process(es) as the handoff bit might be set.
1144 * 2) there are no active writers and some readers, the lock
1145 * must be read owned; so we try to wake any read lock
1146 * waiters that were queued ahead of us.
1147 */
1148 if (count & RWSEM_WRITER_MASK)
1149 goto wait;
1150
1151 rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
1152 ? RWSEM_WAKE_READERS
1153 : RWSEM_WAKE_ANY, &wake_q);
1154
1155 if (!wake_q_empty(&wake_q)) {
1156 /*
1157 * We want to minimize wait_lock hold time especially
1158 * when a large number of readers are to be woken up.
1159 */
1160 raw_spin_unlock_irq(&sem->wait_lock);
1161 wake_up_q(&wake_q);
1162 wake_q_init(&wake_q); /* Used again, reinit */
1163 raw_spin_lock_irq(&sem->wait_lock);
1164 }
1165 } else {
1166 atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1167 }
1168
1169 wait:
1170 trace_android_vh_rwsem_wake(sem);
1171 /* wait until we successfully acquire the lock */
1172 trace_android_vh_rwsem_write_wait_start(sem);
1173 set_current_state(state);
1174 for (null_owner_retries = 0;;) {
1175 if (rwsem_try_write_lock(sem, &waiter)) {
1176 /* rwsem_try_write_lock() implies ACQUIRE on success */
1177 break;
1178 }
1179
1180 raw_spin_unlock_irq(&sem->wait_lock);
1181
1182 if (signal_pending_state(state, current))
1183 goto out_nolock;
1184
1185 /*
1186 * After setting the handoff bit and failing to acquire
1187 * the lock, attempt to spin on owner to accelerate lock
1188 * transfer. If the previous owner is a on-cpu writer and it
1189 * has just released the lock, OWNER_NULL will be returned.
1190 * In this case, we attempt to acquire the lock again
1191 * without sleeping.
1192 */
1193 if (waiter.handoff_set) {
1194 enum owner_state owner_state;
1195
1196 preempt_disable(); <------
1197 owner_state = rwsem_spin_on_owner(sem);
1198 preempt_enable(); <-----
1199
1200 /*
1201 * owner is NULL doesn't guarantee the lock is free.
1202 * An incoming reader will temporarily increment the
1203 * reader count without changing owner and the
1204 * rwsem_try_write_lock() will fails if the reader
1205 * is not able to decrement it in time. Allow 8
1206 * trylock attempts when hitting a NULL owner before
1207 * going to sleep.
1208 */
1209 if ((owner_state == OWNER_NULL) &&
1210 (null_owner_retries < 8)) {
1211 null_owner_retries++;
1212 goto trylock_again;
1213 }
1214 null_owner_retries = 0;
1215 }
1216
1217 schedule();
1218 lockevent_inc(rwsem_sleep_writer);
1219 set_current_state(state);
The read slowpath, rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state), instead goes to sleep via schedule_preempt_disabled(), which is defined as:
6657 void __sched schedule_preempt_disabled(void)
6658 {
6659 sched_preempt_enable_no_resched();
6660 schedule();
6661 preempt_disable();
6662 }
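sched_preempt_enable_no_resched() is what performs the "subtract 1": it drops the count without checking for a pending reschedule. A sketch of its definition from include/linux/preempt.h (the exact form may differ between kernel versions):

#define sched_preempt_enable_no_resched() \
do { \
	barrier(); \
	preempt_count_dec(); \
} while (0)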
4. How do spinlock, hardirq and softirq update preempt_count?
This section only covers the parts of the spinlock implementation that touch preempt_count, not the locking algorithm itself.
First the lock side; the call chain is:
spin_lock_irqsave() -> raw_spin_lock_irqsave() -> _raw_spin_lock_irqsave() -> __raw_spin_lock_irqsave()
1. 【/kernel/include/linux/spinlock_api_smp.h】
2. ===========================================
3.
4. static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock)
5. {
6. unsigned long flags;
7.
8. local_irq_save(flags);
9. preempt_disable();
10. spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11. /*
12. * On lockdep we dont want the hand-coded irq-enable of
13. * do_raw_spin_lock_flags() code, because lockdep assumes
14. * that interrupts are not re-enabled during lock-acquire:
15. */
16. #ifdef CONFIG_LOCKDEP
17. LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
18. #else
19. do_raw_spin_lock_flags(lock, &flags);
20. #endif
21. return flags;
22. }
Line 8: local_irq_save(flags) saves the local CPU's interrupt state into flags and then disables interrupts (cpsid / msr daifset on arm/arm64).
Line 9: preempt_disable() increments the current task's thread_info.preempt_count by one, disabling kernel preemption.
Line 19: the actual acquisition of the spinlock.
Next, the unlock side; the call chain is:
spin_unlock_irqrestore() -> raw_spin_unlock_irqrestore() -> _raw_spin_unlock_irqrestore() -> __raw_spin_unlock_irqrestore():
1. 【/kernel/include/linux/spinlock_api_smp.h】
2. ===========================================
3.
4. static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock,
5. unsigned long flags)
6. {
7. spin_release(&lock->dep_map, 1, _RET_IP_);
8. do_raw_spin_unlock(lock);
9. local_irq_restore(flags);
10. preempt_enable();
11. }
Line 8: releases the spinlock.
Line 9: restores the previously saved flags into the CPU's interrupt state.
Line 10: preempt_enable() decrements the current task's thread_info.preempt_count by one.
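This pairing is exactly why sleeping inside a spinlock critical section triggers the error analyzed here. A hypothetical fragment for illustration (demo_lock and demo_mutex are made-up names):

static DEFINE_SPINLOCK(demo_lock);
static DEFINE_MUTEX(demo_mutex);

void demo_bad(void)
{
	spin_lock(&demo_lock);    /* preempt_count: 0 -> 1 */
	mutex_lock(&demo_mutex);  /* may sleep; if it does, __schedule() sees
	                           * preempt_count != PREEMPT_DISABLE_OFFSET and
	                           * prints "BUG: scheduling while atomic" */
	mutex_unlock(&demo_mutex);
	spin_unlock(&demo_lock);  /* preempt_count: 1 -> 0 */
}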
How hardirq updates preempt_count
el1_interrupt
  |__el1_irq
     |irq_enter_rcu()
        |__irq_enter_raw()    // preempt_count_add(HARDIRQ_OFFSET), i.e. +0x10000
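__irq_enter_raw() is where HARDIRQ_OFFSET gets added; a sketch of its definition from include/linux/hardirq.h (details vary between versions). The matching exit path, irq_exit(), subtracts HARDIRQ_OFFSET again on the way out.

#define __irq_enter_raw()				\
	do {						\
		preempt_count_add(HARDIRQ_OFFSET);	\
		lockdep_hardirq_enter();		\
	} while (0)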
How softirq updates preempt_count
#ifdef CONFIG_TRACE_IRQFLAGS
300 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
301 {
302 unsigned long flags;
303
304 WARN_ON_ONCE(in_irq());
305
306 raw_local_irq_save(flags);
307 /*
308 * The preempt tracer hooks into preempt_count_add and will break
309 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
310 * is set and before current->softirq_enabled is cleared.
311 * We must manually increment preempt_count here and manually
312 * call the trace_preempt_off later.
313 */
314 __preempt_count_add(cnt); <------
315 /*
316 * Were softirqs turned off above:
317 */
318 if (softirq_count() == (cnt & SOFTIRQ_MASK))
319 lockdep_softirqs_off(ip);
320 raw_local_irq_restore(flags);
321
322 if (preempt_count() == cnt) {
323 #ifdef CONFIG_DEBUG_PREEMPT
324 current->preempt_disable_ip = get_lock_parent_ip();
325 #endif
326 trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
327 }
328 }
265 static inline void ksoftirqd_run_begin(void)
266 {
267 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
268 local_irq_disable();
269 }
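Note that ksoftirqd_run_begin() above accounts exactly SOFTIRQ_OFFSET while softirqs are being processed, whereas local_bh_disable() called from process context accounts twice that amount (SOFTIRQ_DISABLE_OFFSET = 2 * SOFTIRQ_OFFSET) so the two states can be told apart in the softirq bits. A sketch of the process-context entry point from include/linux/bottom_half.h (may differ slightly between versions):

static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}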
5. How to debug this class of problem?
Work through the crash stack against the source code, line by line.
Also enable CONFIG_DEBUG_SPINLOCK and CONFIG_DEBUG_PREEMPT.
In this case the crash stack decodes to the following call chain:
el1h_64_irq
  |el1h_64_irq_handler
    |el1_interrupt
      |__el1_irq
        |irq_enter_rcu()
          |__irq_enter_raw()             // preempt_count_add(HARDIRQ_OFFSET), +0x10000
        |do_interrupt_handler
          |if (on_thread_stack())
             call_on_irq_stack(regs, handler);   <------ switch onto the per-CPU irq stack
               |handle_percpu_devid_irq            // the timer interrupt arrives
                 |arch_timer_handler_phys
                   |hrtimer_interrupt
                     |__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
                       |__run_hrtimer(cpu_base, base, timer, &basenow, flags);   // runs the driver's timer callback
                         ###
                         |mutex_lock(&prepare_lock);   // the callback ultimately calls mutex_lock() in interrupt context
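The root cause is thus a driver hrtimer callback, which runs in hardirq context, taking a mutex. A hypothetical minimal module that reproduces the same class of bug (all names are made up; with CONFIG_DEBUG_ATOMIC_SLEEP the might_sleep() check in mutex_lock() flags it even when the lock is uncontended):

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/mutex.h>

static struct hrtimer demo_timer;
static DEFINE_MUTEX(demo_mutex);

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/*
	 * hrtimer callbacks run in hardirq context, so preempt_count already
	 * carries HARDIRQ_OFFSET here. mutex_lock() may sleep, which leads to
	 * "BUG: scheduling while atomic" when it has to wait.
	 */
	mutex_lock(&demo_mutex);
	mutex_unlock(&demo_mutex);
	return HRTIMER_NORESTART;
}

static int __init demo_init(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
	return 0;
}

static void __exit demo_exit(void)
{
	hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The usual fix is to move the mutex-protected work out of the callback into process context (for example a workqueue), or to protect the data with a lock that may be taken in atomic context, depending on what the lock guards.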