Analysis of a "scheduling while atomic" exception
Disclaimer up front: these are personal study notes that have not been through repeated review. If you spot typos or mistakes, please point them out and I will fix them promptly.
The log at the scene of the crash:
BUG: scheduling while atomic: swapper/6/0/0x00010002
<4>[34987.79331428] C6 CPU: 6 PID: 0 Comm: swapper/6 Tainted: G
<4>[34987.79338774] C6 Call trace:
<4>[34987.79340851] C6 dump_backtrace+0xe8/0x108
<4>[34987.79349697] C6 show_stack+0x18/0x28
<4>[34987.79351928] C6 dump_stack_lvl+0x50/0x6c
<4>[34987.79358159] C6 dump_stack+0x18/0x24
<4>[34987.79361736] C6 __schedule_bug+0x5c/0x88
<4>[34987.79365582] C6 __schedule+0x698/0x9d4
<4>[34987.79369043] C6 schedule+0x7c/0xe8
<4>[34987.79371774] C6 schedule_preempt_disabled+0x24/0x40
<4>[34987.79374851] C6 __mutex_lock+0x3ec/0xf04
<4>[34987.79378082] C6 __mutex_lock_slowpath+0x14/0x24
<4>[34987.79381312] C6 mutex_lock+0x30/0xd8
Key error line: scheduling while atomic, with preempt_count = 0x10002.
preempt_count is what marks the atomic context here.
This is a Linux kernel bug report: a task switch was attempted in an atomic context, where schedule() must not be called.
In practice it means that something that can schedule (including taking a sleeping lock) was used in interrupt context (hardirq or softirq) or inside a spinlock critical section.
This article explores:
1. Why "scheduling while atomic" is reported
2. How preempt_count is added to and subtracted from, and what each bit field means
3. Why mutex and rwsem are allowed to schedule
4. How spinlock, hardirq and softirq update preempt_count
5. How to debug this class of problem
1. Why "scheduling while atomic" is reported
__schedule() reports the bug when in_atomic_preempt_off() is true, i.e. when preempt_count() (read from current_thread_info()->preempt.count on arm64) is not exactly PREEMPT_DISABLE_OFFSET:
【/kernel/kernel/sched/core.c, __schedule()】
6117         if (unlikely(in_atomic_preempt_off())) {
6118                 __schedule_bug(prev);
6119                 preempt_count_set(PREEMPT_DISABLED);
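The check compares against PREEMPT_DISABLE_OFFSET rather than 0 because __schedule() always runs with preemption disabled once, so a legal call sees a count of exactly 1. A sketch of the relevant definitions from include/linux/preempt.h (names as in recent kernels; the exact form may differ between versions):

/* The preempt_count offset after preempt_disable() */
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET
#else
# define PREEMPT_DISABLE_OFFSET 0
#endif

/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)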
The function that prints the "scheduling while atomic" report is __schedule_bug():
1. 【/kernel/kernel/sched/core.c】
2.
3. /*
4. * Print scheduling while atomic bug:
5. */
6. static noinline void __schedule_bug(struct task_struct *prev)
7. {
8. if (oops_in_progress)
9. return;
10.
11. printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
12. prev->comm, prev->pid, preempt_count());
13.
14. debug_show_held_locks(prev);
15. print_modules();
16. if (irqs_disabled())
17. print_irqtrace_events(prev);
18. #ifdef CONFIG_DEBUG_PREEMPT
19. if (in_atomic_preempt_off()) {
20. pr_err("Preemption disabled at:");
21. print_ip_sym(current->preempt_disable_ip);
22. pr_cont("\n");
23. }
24. #endif
25. dump_stack();
26. add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
27. }
Line 14: debug_show_held_locks() involves lockdep, a debugging facility that tracks lock acquisition.
Line 15: print_modules() prints the list of loaded modules ("Modules linked in: ...").
Line 17: if interrupts are disabled, print_irqtrace_events() prints irq-trace information, provided CONFIG_TRACE_IRQFLAGS is enabled.
Lines 18~24: with CONFIG_DEBUG_PREEMPT enabled, if we are in atomic context with preemption off, the IP of the call that last disabled preemption is printed.
Line 25: dump_stack() prints the current task's kernel-mode call stack.
2. How is preempt_count added and subtracted, and what does each bit field mean?
preempt_count: definition, add and sub
The preempt_count seen in the log is a member of the architecture-specific struct thread_info; the field exists to support kernel preemption (CONFIG_PREEMPT).
The definition below is from the 32-bit arm header; the arm64 layout is similar, except that the count is accessed as thread_info->preempt.count (as in the __preempt_count_add() listing further down):
1. 【/arch/arm/include/asm/thread_info.h】
2.
3.
4. struct thread_info {
5. unsigned long flags; /* low level flags */
6. int preempt_count; /* 0 => preemptable, <0 => bug */
7. mm_segment_t addr_limit; /* address limit */
8. struct task_struct *task; /* main task structure */
11. };
Line 6 defines preempt_count: a value of 0 means the task can be preempted, a negative value indicates a bug, and a value greater than 0 means preemption is disabled.
The 32 bits of preempt_count are divided into the following fields: preemption count, softirq count, hardirq count, NMI count, and the top bit PREEMPT_NEED_RESCHED (PNR; older kernels kept a PREEMPT_ACTIVE flag in this high region).
The bit layout of preempt_count:
bit 31       : PREEMPT_NEED_RESCHED (PNR, 1 bit)
bits 20 - 23 : NMI count      (4 bits)
bits 16 - 19 : HARDIRQ count  (4 bits)
bits  8 - 15 : SOFTIRQ count  (8 bits)
bits  0 -  7 : PREEMPT count  (8 bits)
The macros and basic operations related to preempt_count are as follows:
203 #define __preempt_count_inc() __preempt_count_add(1)
204 #define __preempt_count_dec() __preempt_count_sub(1)
205
206 #define preempt_count_inc() preempt_count_add(1)
207 #define preempt_count_dec() preempt_count_sub(1)
208
209 #ifdef CONFIG_PREEMPT_COUNT
210
211 #define preempt_disable() \
212 do { \
213 preempt_count_inc(); \
214 barrier(); \
215 } while (0)
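For symmetry, the matching preempt_enable() from the same header decrements the count and, when it drops to zero with a reschedule pending, calls back into the scheduler. A sketch of the CONFIG_PREEMPTION variant (the exact form varies across kernel versions):

#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) \
		__preempt_schedule(); \
} while (0)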
1. 【/kernel/include/linux/preempt.h】
2. ===========================================
3.
4. static __always_inline int preempt_count(void)
5. {
6. return current_thread_info()->preempt_count;
7. }
8.
9. /*
10. * The various preempt_count add/sub methods
11. */
12.
【/arch/arm64/include/asm/preempt.h】
44 static inline void __preempt_count_add(int val)
45 {
46 u32 pc = READ_ONCE(current_thread_info()->preempt.count);
47 pc += val;
48 WRITE_ONCE(current_thread_info()->preempt.count, pc);
49 }
18. static __always_inline void __preempt_count_sub(int val)
19. {
20. *preempt_count_ptr() -= val;
21. }
22.
23. static __always_inline bool __preempt_count_dec_and_test(void)
24. {
25. return !--*preempt_count_ptr() && tif_need_resched();
26. }
Line 4: preempt_count() returns the current task's thread_info preempt count.
Lines 44~48: __preempt_count_add() adds val to the count (this is the arm64 variant, which reads and writes thread_info->preempt.count).
Line 18: __preempt_count_sub() subtracts val from the count.
Line 23: __preempt_count_dec_and_test() first decrements the count by one, then checks whether it reached zero and the current task has TIF_NEED_RESCHED set.
25  *
26  * PREEMPT_MASK:         0x000000ff
27  * SOFTIRQ_MASK:         0x0000ff00
28  * HARDIRQ_MASK:         0x000f0000
29  * NMI_MASK:             0x00f00000
30  * PREEMPT_NEED_RESCHED: 0x80000000
31  */
32 #define PREEMPT_BITS   8
33 #define SOFTIRQ_BITS   8
34 #define HARDIRQ_BITS   4
35 #define NMI_BITS       4
49 #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)  /* 1 << 0  = 0x1      */
50 #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)  /* 1 << 8  = 0x100    */
51 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)  /* 1 << 16 = 0x10000  */
52 #define NMI_OFFSET     (1UL << NMI_SHIFT)      /* 1 << 20 = 0x100000 */
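With these masks, the value from the log, preempt_count = 0x00010002, decodes to a HARDIRQ count of 1 and a PREEMPT count of 2: the task was inside a hard interrupt with preemption disabled. A small user-space sketch (masks copied from the header above) that performs the decoding:

#include <stdio.h>
#include <stdint.h>

#define PREEMPT_MASK  0x000000ff
#define SOFTIRQ_MASK  0x0000ff00
#define HARDIRQ_MASK  0x000f0000
#define NMI_MASK      0x00f00000

int main(void)
{
	uint32_t pc = 0x00010002; /* value from "swapper/6/0/0x00010002" */

	printf("preempt: %u\n", (unsigned)(pc & PREEMPT_MASK));         /* 2 -> preemption disabled twice */
	printf("softirq: %u\n", (unsigned)((pc & SOFTIRQ_MASK) >> 8));  /* 0 */
	printf("hardirq: %u\n", (unsigned)((pc & HARDIRQ_MASK) >> 16)); /* 1 -> inside a hard interrupt */
	printf("nmi:     %u\n", (unsigned)((pc & NMI_MASK) >> 20));     /* 0 */
	return 0;
}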
The following helper macros use these masks to tell which context we are currently in:
140 #define in_irq() (hardirq_count())
141 #define in_softirq() (softirq_count())
142 #define in_interrupt() (irq_count())
107 #define nmi_count() (preempt_count() & NMI_MASK)
108 #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
109 #ifdef CONFIG_PREEMPT_RT
110 # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
111 # define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
112 #else
113 # define softirq_count() (preempt_count() & SOFTIRQ_MASK)
114 # define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
115 #endif
#define in_hardirq() (hardirq_count())
3. Why can mutex and rwsem schedule?
How mutex_lock manipulates preempt_count
In the listing below, preempt_disable() at line 22 adds 1 to preempt_count, and schedule_preempt_disabled() at line 84 subtracts that 1 again before actually scheduling.
The call path is __mutex_lock_slowpath() -> __mutex_lock_common():
3. /*
4. * Lock a mutex (possibly interruptible), slowpath:
5. */
6. static __always_inline int __sched
7. __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
8. struct lockdep_map *nest_lock, unsigned long ip,
9. struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
10. {
11. struct task_struct *task = current;
12. struct mutex_waiter waiter;
13. unsigned long flags;
14. int ret;
15.
16. if (use_ww_ctx) {
17. struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
18. if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
19. return -EALREADY;
20. }
21.
22. preempt_disable();
23. mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
24.
25. if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
26. /* got the lock, yay! */
27. preempt_enable();
28. return 0;
29. }
30.
31. spin_lock_mutex(&lock->wait_lock, flags);
32.
33. /*
34. * Once more, try to acquire the lock. Only try-lock the mutex if
35. * it is unlocked to reduce unnecessary xchg() operations.
36. */
37. if (!mutex_is_locked(lock) &&
38. (atomic_xchg_acquire(&lock->count, 0) == 1))
39. goto skip_wait;
40.
41. debug_mutex_lock_common(lock, &waiter);
42. debug_mutex_add_waiter(lock, &waiter, task);
43.
44. /* add waiting tasks to the end of the waitqueue (FIFO): */
45. list_add_tail(&waiter.list, &lock->wait_list);
46. waiter.task = task;
47.
48. lock_contended(&lock->dep_map, ip);
49.
50. for (;;) {
51. /*
52. * Lets try to take the lock again - this is needed even if
53. * we get here for the first time (shortly after failing to
54. * acquire the lock), to make sure that we get a wakeup once
55. * it's unlocked. Later on, if we sleep, this is the
56. * operation that gives us the lock. We xchg it to -1, so
57. * that when we release the lock, we properly wake up the
58. * other waiters. We only attempt the xchg if the count is
59. * non-negative in order to avoid unnecessary xchg operations:
60. */
61. if (atomic_read(&lock->count) >= 0 &&
62. (atomic_xchg_acquire(&lock->count, -1) == 1))
63. break;
64.
65. /*
66. * got a signal? (This code gets eliminated in the
67. * TASK_UNINTERRUPTIBLE case.)
68. */
69. if (unlikely(signal_pending_state(state, task))) {
70. ret = -EINTR;
71. goto err;
72. }
73.
74. if (use_ww_ctx && ww_ctx->acquired > 0) {
75. ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
76. if (ret)
77. goto err;
78. }
79.
80. __set_task_state(task, state);
81.
82. /* didn't get the lock, go to sleep: */
83. spin_unlock_mutex(&lock->wait_lock, flags);
84. schedule_preempt_disabled();
#        preempt_enable_no_resched();  /* re-enable preemption (count -1) but do not reschedule immediately;
#                                         this is why the mutex path is not in atomic context when it sleeps */
#        schedule();                   /* invoke the scheduler and pick the next task to run */
#        preempt_disable();            /* disable preemption again, restoring the caller's preemption state */
#
85. spin_lock_mutex(&lock->wait_lock, flags);
86. }
87. __set_task_state(task, TASK_RUNNING);
88.
89. mutex_remove_waiter(lock, &waiter, task);
90. /* set it to 0 if there are no waiters left: */
91. if (likely(list_empty(&lock->wait_list)))
92. atomic_set(&lock->count, 0);
93. debug_mutex_free_waiter(&waiter);
94.
95. skip_wait:
96. /* got the lock - cleanup and rejoice! */
97. lock_acquired(&lock->dep_map, ip);
98. mutex_set_owner(lock);
99.
100. if (use_ww_ctx) {
101. struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
102. ww_mutex_set_context_slowpath(ww, ww_ctx);
103. }
104.
105. spin_unlock_mutex(&lock->wait_lock, flags);
106. preempt_enable();
107. return 0;
108.
109. err:
110. mutex_remove_waiter(lock, &waiter, task);
111. spin_unlock_mutex(&lock->wait_lock, flags);
112. debug_mutex_free_waiter(&waiter);
113. mutex_release(&lock->dep_map, 1, ip);
114. preempt_enable();
115. return ret;
116. }
This function is long; we only focus on the preempt-related code and ignore the rest.
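Putting the pieces together, here is a rough trace of preempt_count across the slowpath (a sketch assuming the function is entered with a count of 0 and that spin_lock_mutex() is the plain spin_lock() variant):

/*
 * __mutex_lock_common()
 *   preempt_disable();                          // 0 -> 1
 *   spin_lock_mutex(&lock->wait_lock, flags);   // 1 -> 2
 *   ...
 *   spin_unlock_mutex(&lock->wait_lock, flags); // 2 -> 1, wait_lock dropped before sleeping
 *   schedule_preempt_disabled();
 *     sched_preempt_enable_no_resched();        // 1 -> 0
 *     schedule();                               // legal: preempt_count() == 0 here
 *     preempt_disable();                        // 0 -> 1 again after wake-up
 *   spin_lock_mutex(&lock->wait_lock, flags);   // 1 -> 2, loop continues
 *   ...
 *   preempt_enable();                           // back to 0 once the lock is acquired
 */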
How rwsem's sleeping paths handle preempt_count
rwsem_down_read_slowpath() sleeps through schedule_preempt_disabled(), which subtracts 1 from preempt_count before calling schedule(). The write slowpath below calls schedule() directly, and only disables preemption briefly around its spin-on-owner phase (the two arrows):
static struct rw_semaphore __sched *
1104 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1105 {
1106 long count;
1107 struct rwsem_waiter waiter;
1108 int null_owner_retries;
1109 DEFINE_WAKE_Q(wake_q);
1110 bool already_on_list = false;
1111
1112 /* do optimistic spinning and steal lock if possible */
1113 if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
1114 /* rwsem_optimistic_spin() implies ACQUIRE on success */
1115 trace_android_vh_record_rwsem_lock_starttime(current, jiffies);
1116 return sem;
1117 }
1118
1119 /*
1120 * Optimistic spinning failed, proceed to the slowpath
1121 * and block until we can acquire the sem.
1122 */
1123 waiter.task = current;
1124 waiter.type = RWSEM_WAITING_FOR_WRITE;
1125 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1126 waiter.handoff_set = false;
1127
1128 raw_spin_lock_irq(&sem->wait_lock);
1129
1130 trace_android_vh_alter_rwsem_list_add(
1131 &waiter,
1132 sem, &already_on_list);
1133 if (!already_on_list)
1134 rwsem_add_waiter(sem, &waiter);
1135
1136 /* we're now waiting on the lock */
1137 if (rwsem_first_waiter(sem) != &waiter) {
1138 count = atomic_long_read(&sem->count);
1139
1140 /*
1141 * If there were already threads queued before us and:
1142 * 1) there are no active locks, wake the front
1143 * queued process(es) as the handoff bit might be set.
1144 * 2) there are no active writers and some readers, the lock
1145 * must be read owned; so we try to wake any read lock
1146 * waiters that were queued ahead of us.
1147 */
1148 if (count & RWSEM_WRITER_MASK)
1149 goto wait;
1150
1151 rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
1152 ? RWSEM_WAKE_READERS
1153 : RWSEM_WAKE_ANY, &wake_q);
1154
1155 if (!wake_q_empty(&wake_q)) {
1156 /*
1157 * We want to minimize wait_lock hold time especially
1158 * when a large number of readers are to be woken up.
1159 */
1160 raw_spin_unlock_irq(&sem->wait_lock);
1161 wake_up_q(&wake_q);
1162 wake_q_init(&wake_q); /* Used again, reinit */
1163 raw_spin_lock_irq(&sem->wait_lock);
1164 }
1165 } else {
1166 atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1167 }
1168
1169 wait:
1170 trace_android_vh_rwsem_wake(sem);
1171 /* wait until we successfully acquire the lock */
1172 trace_android_vh_rwsem_write_wait_start(sem);
1173 set_current_state(state);
1174 for (null_owner_retries = 0;;) {
1175 if (rwsem_try_write_lock(sem, &waiter)) {
1176 /* rwsem_try_write_lock() implies ACQUIRE on success */
1177 break;
1178 }
1179
1180 raw_spin_unlock_irq(&sem->wait_lock);
1181
1182 if (signal_pending_state(state, current))
1183 goto out_nolock;
1184
1185 /*
1186 * After setting the handoff bit and failing to acquire
1187 * the lock, attempt to spin on owner to accelerate lock
1188 * transfer. If the previous owner is a on-cpu writer and it
1189 * has just released the lock, OWNER_NULL will be returned.
1190 * In this case, we attempt to acquire the lock again
1191 * without sleeping.
1192 */
1193 if (waiter.handoff_set) {
1194 enum owner_state owner_state;
1195
1196 preempt_disable(); <------
1197 owner_state = rwsem_spin_on_owner(sem);
1198 preempt_enable(); <-----
1199
1200 /*
1201 * owner is NULL doesn't guarantee the lock is free.
1202 * An incoming reader will temporarily increment the
1203 * reader count without changing owner and the
1204 * rwsem_try_write_lock() will fails if the reader
1205 * is not able to decrement it in time. Allow 8
1206 * trylock attempts when hitting a NULL owner before
1207 * going to sleep.
1208 */
1209 if ((owner_state == OWNER_NULL) &&
1210 (null_owner_retries < 8)) {
1211 null_owner_retries++;
1212 goto trylock_again;
1213 }
1214 null_owner_retries = 0;
1215 }
1216
1217 schedule();
1218 lockevent_inc(rwsem_sleep_writer);
1219 set_current_state(state);
The read slowpath, rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state), instead goes to sleep via schedule_preempt_disabled(), which is defined as:
6657 void __sched schedule_preempt_disabled(void)
6658 {
6659 sched_preempt_enable_no_resched();
6660 schedule();
6661 preempt_disable();
6662 }
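sched_preempt_enable_no_resched() is what performs the "subtract 1": it drops the count without checking for a pending reschedule. A sketch of its definition from include/linux/preempt.h (the exact form may differ between kernel versions):

#define sched_preempt_enable_no_resched() \
do { \
	barrier(); \
	preempt_count_dec(); \
} while (0)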
4. How do spinlock, hardirq and softirq update preempt_count?
This section only covers the parts of the spinlock implementation that touch preempt_count, not the locking algorithm itself.
First the lock side; the call chain is:
spin_lock_irqsave() -> raw_spin_lock_irqsave() -> _raw_spin_lock_irqsave() -> __raw_spin_lock_irqsave()
1. 【/kernel/include/linux/spinlock_api_smp.h】
2. ===========================================
3.
4. static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock)
5. {
6. unsigned long flags;
7.
8. local_irq_save(flags);
9. preempt_disable();
10. spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
11. /*
12. * On lockdep we dont want the hand-coded irq-enable of
13. * do_raw_spin_lock_flags() code, because lockdep assumes
14. * that interrupts are not re-enabled during lock-acquire:
15. */
16. #ifdef CONFIG_LOCKDEP
17. LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
18. #else
19. do_raw_spin_lock_flags(lock, &flags);
20. #endif
21. return flags;
22. }
Line 8: local_irq_save(flags) saves the local CPU's interrupt state into flags and then disables interrupts (cpsid / msr daifset on arm/arm64).
Line 9: preempt_disable() increments the current task's thread_info.preempt_count by one, disabling kernel preemption.
Line 19: the actual acquisition of the spinlock.
Next, the unlock side; the call chain is:
spin_unlock_irqrestore() -> raw_spin_unlock_irqrestore() -> _raw_spin_unlock_irqrestore() -> __raw_spin_unlock_irqrestore():
1. 【/kernel/include/linux/spinlock_api_smp.h】
2. ===========================================
3.
4. static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock,
5. unsigned long flags)
6. {
7. spin_release(&lock->dep_map, 1, _RET_IP_);
8. do_raw_spin_unlock(lock);
9. local_irq_restore(flags);
10. preempt_enable();
11. }
Line 8: releases the spinlock.
Line 9: restores the previously saved flags into the CPU's interrupt state.
Line 10: preempt_enable() decrements the current task's thread_info.preempt_count by one.
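This pairing is exactly why sleeping inside a spinlock critical section triggers the error analyzed here. A hypothetical fragment for illustration (demo_lock and demo_mutex are made-up names):

static DEFINE_SPINLOCK(demo_lock);
static DEFINE_MUTEX(demo_mutex);

void demo_bad(void)
{
	spin_lock(&demo_lock);    /* preempt_count: 0 -> 1 */
	mutex_lock(&demo_mutex);  /* may sleep; if it does, __schedule() sees
	                           * preempt_count != PREEMPT_DISABLE_OFFSET and
	                           * prints "BUG: scheduling while atomic" */
	mutex_unlock(&demo_mutex);
	spin_unlock(&demo_lock);  /* preempt_count: 1 -> 0 */
}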
How hardirq updates preempt_count
el1_interrupt
  |__el1_irq
     |irq_enter_rcu()
        |__irq_enter_raw()    // preempt_count_add(HARDIRQ_OFFSET), i.e. +0x10000
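__irq_enter_raw() is where HARDIRQ_OFFSET gets added; a sketch of its definition from include/linux/hardirq.h (details vary between versions). The matching exit path, irq_exit(), subtracts HARDIRQ_OFFSET again on the way out.

#define __irq_enter_raw()				\
	do {						\
		preempt_count_add(HARDIRQ_OFFSET);	\
		lockdep_hardirq_enter();		\
	} while (0)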
How softirq updates preempt_count
#ifdef CONFIG_TRACE_IRQFLAGS
300 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
301 {
302 unsigned long flags;
303
304 WARN_ON_ONCE(in_irq());
305
306 raw_local_irq_save(flags);
307 /*
308 * The preempt tracer hooks into preempt_count_add and will break
309 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
310 * is set and before current->softirq_enabled is cleared.
311 * We must manually increment preempt_count here and manually
312 * call the trace_preempt_off later.
313 */
314 __preempt_count_add(cnt); <------
315 /*
316 * Were softirqs turned off above:
317 */
318 if (softirq_count() == (cnt & SOFTIRQ_MASK))
319 lockdep_softirqs_off(ip);
320 raw_local_irq_restore(flags);
321
322 if (preempt_count() == cnt) {
323 #ifdef CONFIG_DEBUG_PREEMPT
324 current->preempt_disable_ip = get_lock_parent_ip();
325 #endif
326 trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
327 }
328 }
265 static inline void ksoftirqd_run_begin(void)
266 {
267 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
268 local_irq_disable();
269 }
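Note that ksoftirqd_run_begin() above accounts exactly SOFTIRQ_OFFSET while softirqs are being processed, whereas local_bh_disable() called from process context accounts twice that amount (SOFTIRQ_DISABLE_OFFSET = 2 * SOFTIRQ_OFFSET) so the two states can be told apart in the softirq bits. A sketch of the process-context entry point from include/linux/bottom_half.h (may differ slightly between versions):

static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}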
5. How to debug this class of problem?
Work through the crash stack against the source code, line by line.
Also enable CONFIG_DEBUG_SPINLOCK and CONFIG_DEBUG_PREEMPT.
In this case the crash stack decodes to the following call chain:
el1h_64_irq
  |el1h_64_irq_handler
    |el1_interrupt
      |__el1_irq
        |irq_enter_rcu()
          |__irq_enter_raw()             // preempt_count_add(HARDIRQ_OFFSET), +0x10000
        |do_interrupt_handler
          |if (on_thread_stack())
             call_on_irq_stack(regs, handler);   <------ switch onto the per-CPU irq stack
               |handle_percpu_devid_irq            // the timer interrupt arrives
                 |arch_timer_handler_phys
                   |hrtimer_interrupt
                     |__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
                       |__run_hrtimer(cpu_base, base, timer, &basenow, flags);   // runs the driver's timer callback
                         ###
                         |mutex_lock(&prepare_lock);   // the callback ultimately calls mutex_lock() in interrupt context
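The root cause is thus a driver hrtimer callback, which runs in hardirq context, taking a mutex. A hypothetical minimal module that reproduces the same class of bug (all names are made up; with CONFIG_DEBUG_ATOMIC_SLEEP the might_sleep() check in mutex_lock() flags it even when the lock is uncontended):

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/mutex.h>

static struct hrtimer demo_timer;
static DEFINE_MUTEX(demo_mutex);

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/*
	 * hrtimer callbacks run in hardirq context, so preempt_count already
	 * carries HARDIRQ_OFFSET here. mutex_lock() may sleep, which leads to
	 * "BUG: scheduling while atomic" when it has to wait.
	 */
	mutex_lock(&demo_mutex);
	mutex_unlock(&demo_mutex);
	return HRTIMER_NORESTART;
}

static int __init demo_init(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
	return 0;
}

static void __exit demo_exit(void)
{
	hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The usual fix is to move the mutex-protected work out of the callback into process context (for example a workqueue), or to protect the data with a lock that may be taken in atomic context, depending on what the lock guards.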