spinlock bug (finally solved by removing the debug option)

本篇博客详细记录了一次spinlock调试修复的过程,包括错误复现、根本原因分析及最终解决方案。作者发现当前任务PID在SMP环境下并非全局唯一,导致spinlock调试代码出现问题。通过将owner_pid替换为指向task_struct的指针解决了该问题。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

spinlock-debug fix From: Ingo Molnar To: Andrew Morton Subject: [patch] spinlock-debug fix Date: Mon, 27 Jun 2005 11:48:44 +0200 Cc: Reuben Farrelly , linux-kernel@vger.kernel.org Archive-link: Article, Thread * Andrew Morton wrote: > Reuben Farrelly wrote: > > > > > Anyway, scary trace. It look like some spinlock is thought to be in the > > > wrong state in schedule(). Send the .config, please. > > > > Now online at http://www.reub.net/kernel/.config > > Me too. > > BUG: spinlock recursion on CPU#0, swapper/0, c120d520 > [] dump_stack+0x19/0x20 > [] spin_bug+0x42/0x54 > [] _raw_spin_lock+0x3e/0x84 > [] _spin_lock+0x9/0x10 > [] schedule+0x479/0xbc8 > [] cpu_idle+0x88/0x8c > [] rest_init+0x21/0x28 > [] start_kernel+0x151/0x158 > [] 0xc010020f > Kernel panic - not syncing: bad locking > > The bug is in the new spinlock debugging code itself. Ingo, can you > test that .config please? couldnt reproduce it on an UP box, nor on an SMP/HT 2/4-way box, but it finally triggered on a 2-way SMP box. the bug is that current->pid is not a unique identifier on SMP (doh!). The patch below fixes the bug - which also happens to be a speedup for the debugging code, as the ->pid dereferencing does not have to be done anymore. Also, i've disabled the panicing for now. 
Ingo - change owner_pid to owner, to fix bad pid uniqueness assumption on SMP - some more debug output printed - don't panic for now Signed-off-by: Ingo Molnar include/linux/spinlock_types.h | 16 ++++++++++------ kernel/sched.c | 2 +- lib/spinlock_debug.c | 30 +++++++++++++++++++----------- 3 files changed, 30 insertions(+), 18 deletions(-) Index: linux/include/linux/spinlock_types.h =================================================================== --- linux.orig/include/linux/spinlock_types.h +++ linux/include/linux/spinlock_types.h @@ -21,11 +21,12 @@ typedef struct { unsigned int break_lock; #endif #ifdef CONFIG_DEBUG_SPINLOCK - unsigned int magic, owner_pid, owner_cpu; + unsigned int magic, owner_cpu; + void *owner; #endif } spinlock_t; -#define SPINLOCK_MAGIC 0xdead4ead +#define SPINLOCK_MAGIC 0xdead4ead typedef struct { raw_rwlock_t raw_lock; @@ -33,22 +34,25 @@ typedef struct { unsigned int break_lock; #endif #ifdef CONFIG_DEBUG_SPINLOCK - unsigned int magic, owner_pid, owner_cpu; + unsigned int magic, owner_cpu; + void *owner; #endif } rwlock_t; -#define RWLOCK_MAGIC 0xdeaf1eed +#define RWLOCK_MAGIC 0xdeaf1eed + +#define SPINLOCK_OWNER_INIT ((void *)-1L) #ifdef CONFIG_DEBUG_SPINLOCK # define SPIN_LOCK_UNLOCKED \ (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \ .magic = SPINLOCK_MAGIC, \ - .owner_pid = -1, \ + .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1 } #define RW_LOCK_UNLOCKED \ (rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \ .magic = RWLOCK_MAGIC, \ - .owner_pid = -1, \ + .owner = SPINLOCK_OWNER_INIT, \ .owner_cpu = -1 } #else # define SPIN_LOCK_UNLOCKED \ Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c @@ -1604,7 +1604,7 @@ static inline void finish_task_switch(ru prev_task_flags = prev->flags; #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner_pid = current->pid; + 
rq->lock.owner = current; #endif finish_arch_switch(prev); finish_lock_switch(rq, prev); Index: linux/lib/spinlock_debug.c =================================================================== --- linux.orig/lib/spinlock_debug.c +++ linux/lib/spinlock_debug.c @@ -14,16 +14,24 @@ static void spin_bug(spinlock_t *lock, const char *msg) { static long print_once = 1; + struct task_struct *owner = NULL; if (xchg(&print_once, 0)) { - printk("BUG: spinlock %s on CPU#%d, %s/%d, %p\n", msg, - smp_processor_id(), current->comm, current->pid, lock); + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) + owner = lock->owner; + printk("BUG: spinlock %s on CPU#%d, %s/%d\n", + msg, smp_processor_id(), current->comm, current->pid); + printk(" lock: %p, .magic: %08x, .owner: %s/%d, .owner_cpu: %d\n", + lock, lock->magic, + owner ? owner->comm : "<none>", + owner ? owner->pid : -1, + lock->owner_cpu); dump_stack(); #ifdef CONFIG_SMP /* * We cannot continue on SMP: */ - panic("bad locking"); +// panic("bad locking"); #endif } } @@ -33,7 +41,7 @@ static void spin_bug(spinlock_t *lock, c static inline void debug_spin_lock_before(spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner_pid == current->pid, lock, "recursion"); + SPIN_BUG_ON(lock->owner == current, lock, "recursion"); SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), lock, "cpu recursion"); } @@ -41,17 +49,17 @@ static inline void debug_spin_lock_befor static inline void debug_spin_lock_after(spinlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); - lock->owner_pid = current->pid; + lock->owner = current; } static inline void debug_spin_unlock(spinlock_t *lock) { SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); SPIN_BUG_ON(!spin_is_locked(lock), lock, "already unlocked"); - SPIN_BUG_ON(lock->owner_pid != current->pid, lock, "wrong owner"); + SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); SPIN_BUG_ON(lock->owner_cpu != 
raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner_pid = -1; + lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; } @@ -176,7 +184,7 @@ void _raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner_pid == current->pid, lock, "recursion"); + RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), lock, "cpu recursion"); } @@ -184,16 +192,16 @@ static inline void debug_write_lock_befo static inline void debug_write_lock_after(rwlock_t *lock) { lock->owner_cpu = raw_smp_processor_id(); - lock->owner_pid = current->pid; + lock->owner = current; } static inline void debug_write_unlock(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner_pid != current->pid, lock, "wrong owner"); + RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), lock, "wrong CPU"); - lock->owner_pid = -1; + lock->owner = SPINLOCK_OWNER_INIT; lock->owner_cpu = -1; }

### 自旋锁(Spin Lock)的实现原理及使用场景 #### 1. 自旋锁的实现原理 自旋锁是一种轻量级的同步原语,用于保护共享资源的访问。其核心思想是通过让线程在一个循环中不断检查锁的状态,直到成功获取锁为止,这种行为被称为“自旋”。以下是其实现的关键步骤: - **获取锁**: 当一个线程尝试获取锁时,它会检查锁的状态。如果锁未被占用,则直接获取锁并标记为已占用。如果锁已被占用,则线程进入一个循环(即“自旋”),持续检查锁的状态,直到锁被释放[^4]。 ```c static inline void spin_lock(spinlock_t *lock) { while (!spin_trylock(lock)) { // 尝试获取锁,失败则继续循环 cpu_relax(); // 提示编译器和处理器优化自旋等待 } } ``` - **释放锁**: 当持有锁的线程完成对共享资源的操作后,它会释放锁,允许其他线程获取锁。释放锁的过程通常涉及清除锁的状态标志,并确保内存屏障以保证多核环境下的正确性[^1]。 ```c static inline void spin_unlock(spinlock_t *lock) { raw_spin_unlock(&lock->rlock); } ``` - **内存屏障与多核同步**: 在多核系统中,为了确保不同CPU之间的内存可见性,自旋锁的实现通常会包含内存屏障指令(如 `smp_mb()` 或 `dsb_sev()`)。这些指令确保在释放锁之前,所有对该共享资源的修改都已完成并写入内存[^5]。 #### 2. 自旋锁的使用场景 自旋锁适用于以下场景: - **短时间持有锁**: 自旋锁最适合于那些持有锁的时间非常短的场景。在这种情况下,自旋锁可以避免线程进入内核态(如使用系统调用的休眠和唤醒),从而提供更高的性能[^2]。 - **低延迟要求**: 在需要低延迟的场景中,自旋锁可以减少线程切换带来的开销。例如,在实时系统或高性能计算中,自旋锁可以确保线程快速响应共享资源的变化[^3]。 - **高竞争环境**: 在多核系统中,当多个线程频繁竞争同一资源时,自旋锁可能比传统的互斥锁更高效。尽管自旋锁会消耗CPU资源,但在高竞争环境下,线程切换的开销可能会更大[^3]。 #### 3. 自旋锁的优缺点 - **优点**: - 实现简单,开销小。 - 在短时间持有锁的情况下,性能优于互斥锁。 - 避免了线程切换带来的开销。 - **缺点**: - 如果锁的持有时间过长,会导致CPU资源浪费。 - 不适合单核系统或资源受限的环境,因为自旋会占用CPU时间[^5]。 #### 4. 示例代码 以下是一个简单的自旋锁实现示例: ```c #include <stdatomic.h> #include <stdbool.h> typedef struct { atomic_bool locked; } spinlock_t; void spin_lock(spinlock_t *lock) { while (atomic_exchange(&lock->locked, true)) { // 尝试获取锁 cpu_relax(); // 自旋等待 } } void spin_unlock(spinlock_t *lock) { atomic_store(&lock->locked, false); // 释放锁 } ``` ###
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值