C++死锁问题详解

原创于 2025-11-24 07:30:00 发布 · 615 阅读

5 ·

CC 4.0 BY-SA版权

文章标签：

#c++ #开发语言

C++疑难杂症专栏收录该内容

65 篇文章

订阅专栏

C++死锁问题详解

1. 死锁基本概念

1.1 什么是死锁

死锁是指两个或多个线程在执行过程中，因争夺资源而造成的一种互相等待的现象，若无外力作用，它们都将无法继续执行。

#include <cstdlib>
#include <iostream>
#include <mutex>
#include <thread>
#include <unordered_map>

// Minimal deadlock demonstration: the two worker methods lock the same pair of
// mutexes in opposite order, so running them on two threads at once can leave
// each thread holding one mutex while it waits for the other.
class BasicDeadlock {
    std::mutex mutex1, mutex2;
    
public:
    // Locks mutex1, sleeps 100 ms while holding it, then locks mutex2.
    // Both locks are released when the function returns.
    void thread1_work() {
        std::lock_guard<std::mutex> lock1(mutex1);
        std::cout << "Thread 1 acquired mutex1" << std::endl;
        
        // The pause widens the window in which the other worker can take mutex2.
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        
        std::lock_guard<std::mutex> lock2(mutex2);  // blocks until mutex2 is free
        std::cout << "Thread 1 acquired mutex2" << std::endl;
    }
    
    // Mirror image of thread1_work(): locks mutex2 first, sleeps 100 ms, then
    // locks mutex1. The reversed order is what makes the deadlock possible.
    void thread2_work() {
        std::lock_guard<std::mutex> lock2(mutex2);
        std::cout << "Thread 2 acquired mutex2" << std::endl;
        
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        
        std::lock_guard<std::mutex> lock1(mutex1);  // blocks until mutex1 is free
        std::cout << "Thread 2 acquired mutex1" << std::endl;
    }
};

void demonstrate_basic_deadlock() {
    BasicDeadlock example;
    
    std::thread t1([&] { example.thread1_work(); });
    std::thread t2([&] { example.thread2_work(); });
    
    t1.join();
    t2.join();  // 可能永远阻塞在这里
}

1.2 死锁的四个必要条件

互斥条件：资源不能被共享，只能由一个线程使用
请求与保持条件：线程持有至少一个资源，并等待获取其他资源
不可剥夺条件：资源只能由持有者释放，不能被强制剥夺
循环等待条件：存在一个线程-资源的循环等待链

2. 常见的死锁场景

2.1 锁顺序不一致

// Demonstrates a lock-order inversion: withdraw() takes account_mutex then
// log_mutex, while audit() takes log_mutex then account_mutex. Calling the two
// methods concurrently can therefore deadlock.
class LockOrderDeadlock {
    std::mutex account_mutex;
    std::mutex log_mutex;
    double balance = 1000.0;
    
public:
    // Order 1: lock the account first, then the log.
    // Subtracts `amount` from the balance and prints the result only when the
    // balance is at least `amount`; otherwise nothing happens.
    void withdraw(double amount) {
        std::lock_guard<std::mutex> account_lock(account_mutex);
        
        // Simulate some processing time (while account_mutex is held)
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        
        std::lock_guard<std::mutex> log_lock(log_mutex);
        
        if (balance >= amount) {
            balance -= amount;
            std::cout << "Withdrew " << amount << ", balance: " << balance << std::endl;
        }
    }
    
    // Order 2: lock the log first, then the account - deadlock risk!
    // Prints the current balance.
    void audit() {
        std::lock_guard<std::mutex> log_lock(log_mutex);
        
        // Simulate some processing time (while log_mutex is held)
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        
        std::lock_guard<std::mutex> account_lock(account_mutex);
        
        std::cout << "Audit: balance is " << balance << std::endl;
    }
};

2.2 递归锁误用

// Shows where a recursive mutex helps and where it does not: re-entering the
// same recursive mutex on one thread is fine, but mixing it with a second,
// unrelated mutex reintroduces the usual lock-ordering hazard.
class RecursiveLockIssue {
    std::recursive_mutex mutex;

public:
    // Holds the recursive mutex and, while holding it, calls method_b(),
    // which locks the same mutex again on the same thread.
    void method_a() {
        std::unique_lock<std::recursive_mutex> outer_guard(mutex);
        method_b();
        std::cout << "Method A completed" << std::endl;
    }

    // Locks the recursive mutex; succeeds even when the calling thread
    // already owns it (as it does when reached through method_a()).
    void method_b() {
        std::unique_lock<std::recursive_mutex> inner_guard(mutex);
        std::cout << "Method B completed" << std::endl;
    }

    // Hazardous pattern: takes this object's mutex and then a caller-supplied
    // one. If another thread takes the two in the opposite order, the threads
    // can deadlock; the recursive mutex offers no protection against that.
    void dangerous_method(std::mutex& other_mutex) {
        std::unique_lock<std::recursive_mutex> own_guard(mutex);
        std::unique_lock<std::mutex> foreign_guard(other_mutex);
    }
};

2.3 回调函数中的死锁

// Demonstrates a self-deadlock caused by invoking user callbacks while a
// non-recursive mutex is held: a callback that calls back into this object
// tries to lock the mutex its own thread already owns.
class CallbackDeadlock {
    std::mutex mutex;
    std::vector<std::function<void()>> callbacks;
    
public:
    // Appends `callback` to the list while holding the mutex.
    void register_callback(std::function<void()> callback) {
        std::lock_guard<std::mutex> lock(mutex);
        callbacks.push_back(callback);
    }
    
    // Invokes every registered callback while still holding the mutex.
    // NOTE(review): besides the locking problem, a callback that managed to
    // append to `callbacks` during this loop could invalidate the iteration.
    void notify_all() {
        std::lock_guard<std::mutex> lock(mutex);
        
        for (auto& callback : callbacks) {
            // Danger: the callback may call register_callback() again
            callback();  // may deadlock
        }
    }
    
    // Registers a callback that itself calls register_callback(), then fires
    // notify_all(); the nested registration needs the mutex notify_all() holds.
    void problematic_usage() {
        register_callback([this] {
            // This callback tries to acquire the same mutex again
            register_callback([] { 
                std::cout << "Nested callback" << std::endl; 
            });
        });
        
        notify_all();  // deadlock!
    }
};

2.4 条件变量与死锁

// Demonstrates poor locking discipline around a condition variable: the
// producer keeps the mutex locked while it sleeps, and the consumer has no
// way to terminate.
class ConditionVariableDeadlock {
    std::mutex mutex;
    std::condition_variable cv;
    bool data_ready = false;
    std::queue<int> data_queue;
    
public:
    // Pushes the values 0..9, notifying one waiter after each push.
    // The unique_lock is only released at the end of each loop iteration, so
    // the mutex stays locked for the whole 100 ms sleep.
    void producer() {
        for (int i = 0; i < 10; ++i) {
            std::unique_lock<std::mutex> lock(mutex);
            
            // If the queue were full we would wait for the consumer (no actual
            // full-queue check is implemented here)
            data_queue.push(i);
            data_ready = true;
            
            cv.notify_one();
            
            // The lock is not released before sleeping, i.e. it is held at an
            // inappropriate time
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            // The lock is still held here, so the consumer cannot acquire it
        }
    }
    
    // Waits for data_ready, pops and prints at most one value per iteration,
    // and clears data_ready once the queue is empty.
    // The loop has no exit condition: once the producer is done and the queue
    // is drained, this function blocks in cv.wait() indefinitely.
    void consumer() {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            
            cv.wait(lock, [this] { return data_ready; });
            
            if (!data_queue.empty()) {
                int value = data_queue.front();
                data_queue.pop();
                std::cout << "Consumed: " << value << std::endl;
            }
            
            if (data_queue.empty()) {
                data_ready = false;
            }
        }
    }
};

3. 死锁检测方法

3.1 运行时死锁检测

#include <stack>
#include <unordered_set>

// Debug-time lock-order checker for std::mutex.
//
// Every acquisition is announced with note_acquire() *before* the thread
// blocks on the mutex, and every release with note_release(). The detector
// remembers, across all threads, which mutex was requested while which other
// mutex was held (the acquisition-order graph). When a new request would close
// a cycle in that graph - for example one thread takes A then B and another
// takes B then A - a warning is written to std::cerr. The warning is emitted
// even if the threads never actually collide, which is the point: the hazard
// is reported on the first inconsistent ordering, not on the first hang.
//
// Limitations: edges are never aged out, so a cycle is reported even if the
// conflicting acquisitions can never overlap in time; call forget() when a
// tracked mutex is destroyed so a later mutex at the same address does not
// inherit its history.
class DeadlockDetector {
    // Mutexes the calling thread has announced and not yet released,
    // most recent on top.
    static inline thread_local std::stack<std::mutex*> lock_stack;
    // Guards lock_order, which is shared by all threads.
    static inline std::mutex detector_mutex;
    // lock_order[a] contains b when some thread requested b while holding a.
    static inline std::unordered_map<std::mutex*, std::unordered_set<std::mutex*>> lock_order;

public:
    // Announces that the calling thread is about to lock `mutex`.
    // Prints a warning when the request contradicts an ordering seen earlier
    // (or when the thread already holds `mutex`), then records the new
    // ordering and marks `mutex` as held by this thread.
    static void note_acquire(std::mutex& mutex) {
        std::mutex* const wanted = &mutex;
        {
            std::lock_guard<std::mutex> guard(detector_mutex);

            // A path wanted -> ... -> held means some thread may hold `wanted`
            // while waiting (directly or transitively) for `held`; requesting
            // `wanted` while holding `held` would close the cycle.
            bool hazard = false;
            for (std::stack<std::mutex*> held = lock_stack; !held.empty(); held.pop()) {
                if (reaches(wanted, held.top())) {
                    hazard = true;
                    break;
                }
            }

            if (hazard) {
                std::cerr << "POTENTIAL DEADLOCK DETECTED!" << std::endl;
                print_lock_stack();
            }

            for (std::stack<std::mutex*> held = lock_stack; !held.empty(); held.pop()) {
                if (held.top() != wanted) {
                    lock_order[held.top()].insert(wanted);
                }
            }
        }
        lock_stack.push(wanted);
    }

    // Announces that the calling thread no longer holds `mutex`.
    // Releases need not be in LIFO order; the most recent entry for `mutex`
    // is removed wherever it sits in the per-thread stack.
    static void note_release(std::mutex& mutex) {
        std::stack<std::mutex*> above;
        while (!lock_stack.empty() && lock_stack.top() != &mutex) {
            above.push(lock_stack.top());
            lock_stack.pop();
        }
        if (!lock_stack.empty()) {
            lock_stack.pop();
        }
        while (!above.empty()) {
            lock_stack.push(above.top());
            above.pop();
        }
    }

    // Drops every recorded ordering that involves `mutex`. Intended to be
    // called when a tracked mutex is destroyed.
    static void forget(std::mutex& mutex) {
        std::lock_guard<std::mutex> guard(detector_mutex);
        lock_order.erase(&mutex);
        for (auto& entry : lock_order) {
            entry.second.erase(&mutex);
        }
    }

    // RAII helper: the mutex counts as held by the constructing thread for
    // exactly the lifetime of the tracker. The tracker does not lock or
    // unlock the mutex itself.
    class LockTracker {
        std::mutex* mutex_ptr;

    public:
        explicit LockTracker(std::mutex& mutex) : mutex_ptr(&mutex) {
            note_acquire(mutex);
        }

        ~LockTracker() {
            note_release(*mutex_ptr);
        }

        // Copying would unregister the same acquisition twice.
        LockTracker(const LockTracker&) = delete;
        LockTracker& operator=(const LockTracker&) = delete;
    };

private:
    // Returns true when `to` can be reached from `from` by following recorded
    // orderings (true immediately when from == to).
    // The caller must hold detector_mutex.
    static bool reaches(std::mutex* from, std::mutex* to) {
        std::stack<std::mutex*> pending;
        std::unordered_set<std::mutex*> seen;
        pending.push(from);

        while (!pending.empty()) {
            std::mutex* const node = pending.top();
            pending.pop();

            if (node == to) return true;
            if (!seen.insert(node).second) continue;  // already expanded

            const auto successors = lock_order.find(node);
            if (successors == lock_order.end()) continue;
            for (std::mutex* next : successors->second) {
                pending.push(next);
            }
        }
        return false;
    }

    // Prints the calling thread's held mutexes, most recent first.
    static void print_lock_stack() {
        std::stack<std::mutex*> temp = lock_stack;
        std::cerr << "Lock stack: ";
        while (!temp.empty()) {
            std::cerr << temp.top() << " -> ";
            temp.pop();
        }
        std::cerr << "current" << std::endl;
    }
};

// Mutex wrapper that reports every lock/unlock to DeadlockDetector.
// Satisfies BasicLockable, so it works with std::lock_guard.
class InstrumentedMutex {
    std::mutex real_mutex;

public:
    ~InstrumentedMutex() {
        // Prevent a future mutex at the same address from inheriting history.
        DeadlockDetector::forget(real_mutex);
    }

    // Registers the request first so that a warning is printed before the
    // thread can block, and keeps the registration until unlock().
    void lock() {
        DeadlockDetector::note_acquire(real_mutex);
        try {
            real_mutex.lock();
        } catch (...) {
            DeadlockDetector::note_release(real_mutex);
            throw;
        }
    }

    void unlock() {
        real_mutex.unlock();
        DeadlockDetector::note_release(real_mutex);
    }
};

3.2 静态与动态分析工具

# Clang Static Analyzer: inspects the source at compile time; the program is not run
clang --analyze -Xanalyzer -analyzer-output=text program.cpp

# ThreadSanitizer: builds an instrumented binary; findings are reported when ./program is executed
g++ -fsanitize=thread -g -O1 program.cpp -o program

# Helgrind (a Valgrind tool): runs the already-built binary under Valgrind
valgrind --tool=helgrind ./program

4. 死锁预防解决方案

4.1 锁顺序策略

// Lock-hierarchy demonstration: every mutex has a level and a thread may only
// acquire a mutex whose level is strictly lower than the level of the mutex it
// acquired most recently. Acquiring in any other order throws, which turns a
// potential deadlock into an immediate, reproducible error.
class LockHierarchy {
    // Levels of the hierarchy. LEVEL_UNLOCKED is a sentinel above every real
    // level; it is the per-thread state while no hierarchical mutex is held,
    // so that even a LEVEL_HIGH mutex can be taken first.
    enum LockLevel {
        LEVEL_LOW = 1,
        LEVEL_MEDIUM = 2,
        LEVEL_HIGH = 3,
        LEVEL_UNLOCKED = 4
    };

    // BasicLockable mutex that enforces the hierarchy for the calling thread.
    // Releases are expected in reverse order of acquisition (as std::lock_guard
    // does); each unlock() restores the level that was current before the
    // matching lock().
    class HierarchicalMutex {
        std::mutex internal_mutex;
        const LockLevel level;
        // Level the owning thread had before it locked this mutex. Only
        // touched while internal_mutex is held.
        LockLevel previous_level = LEVEL_UNLOCKED;
        // Level of the mutex most recently acquired by the calling thread.
        static inline thread_local LockLevel current_level = LEVEL_UNLOCKED;

    public:
        explicit HierarchicalMutex(LockLevel lvl) : level(lvl) {}

        // Throws std::logic_error, without locking, when this mutex is not
        // strictly below the thread's current level.
        void lock() {
            if (level >= current_level) {
                throw std::logic_error("Lock hierarchy violation!");
            }
            internal_mutex.lock();
            previous_level = current_level;
            current_level = level;
        }

        void unlock() {
            // Restore the level of the enclosing lock rather than a fixed
            // value, so that locks still held keep constraining the thread.
            current_level = previous_level;
            internal_mutex.unlock();
        }
    };

    HierarchicalMutex low_mutex{LEVEL_LOW};
    HierarchicalMutex medium_mutex{LEVEL_MEDIUM};
    HierarchicalMutex high_mutex{LEVEL_HIGH};

public:
    // Acquires the three mutexes from the highest level down - always allowed.
    void correct_operation1() {
        std::lock_guard<HierarchicalMutex> lock1(high_mutex);    // level 3
        std::lock_guard<HierarchicalMutex> lock2(medium_mutex);  // level 2 - allowed
        std::lock_guard<HierarchicalMutex> lock3(low_mutex);     // level 1 - allowed
        // work...
    }

    // Takes the lowest-level mutex first. Uncommenting the second line would
    // throw std::logic_error because it asks for a higher-level mutex.
    void incorrect_operation() {
        std::lock_guard<HierarchicalMutex> lock1(low_mutex);     // level 1
        // std::lock_guard<HierarchicalMutex> lock2(high_mutex); // error!
    }
};

4.2 同时获取多个锁

// Three ways of taking several mutexes without risking a lock-order deadlock.
class SimultaneousLock {
    std::mutex mutex1, mutex2, mutex3;

public:
    // Variant 1: std::lock acquires all three with its deadlock-avoidance
    // algorithm; the guards then adopt the already-held mutexes so they are
    // released automatically (mutex3 first, mutex1 last).
    void operation_with_std_lock() {
        std::lock(mutex1, mutex2, mutex3);
        std::lock_guard<std::mutex> first(mutex1, std::adopt_lock);
        std::lock_guard<std::mutex> second(mutex2, std::adopt_lock);
        std::lock_guard<std::mutex> third(mutex3, std::adopt_lock);

        std::cout << "All locks acquired safely" << std::endl;
    }

    // Variant 2: C++17 std::scoped_lock does the same in a single object.
    void operation_with_scoped_lock() {
        std::scoped_lock<std::mutex, std::mutex, std::mutex> all_three(mutex1, mutex2, mutex3);
        std::cout << "All locks acquired with scoped_lock" << std::endl;
    }

    // Variant 3: plain sequential locking, safe only because every caller
    // uses the same mutex1 -> mutex2 -> mutex3 order.
    void operation_with_defined_order() {
        std::lock_guard<std::mutex> first(mutex1);
        std::lock_guard<std::mutex> second(mutex2);
        std::lock_guard<std::mutex> third(mutex3);

        std::cout << "Locks acquired in fixed order" << std::endl;
    }
};

4.3 超时和尝试锁

// Avoids indefinite blocking by giving up when a mutex cannot be obtained,
// either within a timeout or immediately.
class TimeoutLock {
    std::timed_mutex mutex1, mutex2;

public:
    // Tries to take mutex1 and then mutex2, waiting at most `timeout` for
    // each. Returns true after doing the work with both held, false (after
    // printing which mutex failed) otherwise. Anything acquired is released
    // on return.
    bool try_operation(std::chrono::milliseconds timeout) {
        // The duration constructor performs try_lock_for(timeout).
        std::unique_lock<std::timed_mutex> first(mutex1, timeout);
        if (!first.owns_lock()) {
            std::cout << "Failed to acquire mutex1 within timeout" << std::endl;
            return false;
        }

        std::unique_lock<std::timed_mutex> second(mutex2, timeout);
        if (!second.owns_lock()) {
            std::cout << "Failed to acquire mutex2 within timeout" << std::endl;
            return false;  // `first` is released by its destructor
        }

        perform_critical_work();
        return true;
    }

    // Makes one non-blocking attempt on each mutex. Does the work and returns
    // true only when both attempts succeeded; otherwise backs off at once.
    bool try_all_locks_simultaneously() {
        std::unique_lock<std::timed_mutex> first(mutex1, std::try_to_lock);
        std::unique_lock<std::timed_mutex> second(mutex2, std::try_to_lock);

        const bool got_both = first.owns_lock() && second.owns_lock();
        if (!got_both) {
            std::cout << "Could not acquire all locks, backing off..." << std::endl;
            return false;
        }

        perform_critical_work();
        return true;
    }

private:
    // Stand-in for the protected work: prints a message and sleeps 50 ms.
    void perform_critical_work() {
        std::cout << "Performing critical work..." << std::endl;
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }
};

4.4 无锁编程

#include <atomic>

// Integer counter built on std::atomic; no mutex is involved, so its
// operations cannot take part in a deadlock.
class LockFreeCounter {
    std::atomic<int> value{0};
    
public:
    // Adds one to the counter.
    void increment() {
        value.fetch_add(1, std::memory_order_relaxed);
    }
    
    // Returns the current value.
    int get() const {
        return value.load(std::memory_order_acquire);
    }
    
    // Replaces the value with `new_value` if it currently equals `expected`.
    // Returns true when the replacement happened.
    //
    // This is a single attempt, not a retry loop, so the strong form is
    // required: compare_exchange_weak is allowed to fail spuriously even when
    // the value matches, which would make callers see a false "mismatch".
    bool compare_and_swap(int expected, int new_value) {
        return value.compare_exchange_strong(expected, new_value,
                                             std::memory_order_acq_rel,
                                             std::memory_order_acquire);
    }
};

// 无锁队列示例
template<typename T>
class LockFreeQueue {
private:
    struct Node {
        T data;
        std::atomic<Node*> next;
        Node(const T& data) : data(data), next(nullptr) {}
    };
    
    std::atomic<Node*> head;
    std::atomic<Node*> tail;
    
public:
    LockFreeQueue() {
        Node* dummy = new Node(T{});
        head.store(dummy);
        tail.store(dummy);
    }
    
    ~LockFreeQueue() {
        while (Node* old_head = head.load()) {
            head.store(old_head->next);
            delete old_head;
        }
    }
    
    void push(const T& data) {
        Node* new_node = new Node(data);
        Node* old_tail = tail.load();
        
        while (true) {
            Node* next = old_tail->next.load();
            if (!next) {
                if (old_tail->next.compare_exchange_weak(next, new_node)) {
                    tail.compare_exchange_weak(old_tail, new_node);
                    return;
                }
            } else {
                tail.compare_exchange_weak(old_tail, next);
            }
        }
    }
    
    bool pop(T& result) {
        Node* old_head = head.load();
        while (true) {
            Node* next = old_head->next.load();
            if (!next) return false;
            
            if (head.compare_exchange_weak(old_head, next)) {
                result = std::move(next->data);
                delete old_head;
                return true;
            }
        }
    }
};

5. 死锁恢复策略

5.1 死锁检测和恢复

// Retry-with-backoff pattern: instead of blocking on the mutexes, keep making
// non-blocking attempts and fall back to a degraded path when they keep
// failing for too long.
class DeadlockRecovery {
    std::mutex mutex1, mutex2;
    std::atomic<bool> shutdown_requested{false};

public:
    // Retries the two-mutex operation every 100 ms for up to 5 seconds.
    // Returns true as soon as one attempt succeeds. Returns false when
    // shutdown has been requested, or after the time limit (in which case the
    // recovery path has been run first).
    bool cooperative_operation_with_timeout() {
        using clock = std::chrono::steady_clock;
        const clock::time_point began = clock::now();
        const std::chrono::seconds limit(5);

        for (;;) {
            if (shutdown_requested.load()) {
                return false;
            }

            if (try_operation_with_backoff()) {
                return true;
            }

            const bool out_of_time = (clock::now() - began) > limit;
            if (out_of_time) {
                std::cout << "Operation timeout, possible deadlock detected" << std::endl;
                recover_from_deadlock();
                return false;
            }

            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
    }

    // Makes cooperative_operation_with_timeout() stop retrying.
    void request_shutdown() {
        shutdown_requested.store(true);
    }

private:
    // One non-blocking attempt: do the work only when both mutexes could be
    // taken immediately; otherwise release whatever was obtained and report
    // failure.
    bool try_operation_with_backoff() {
        std::unique_lock<std::mutex> first(mutex1, std::try_to_lock);
        if (!first) {
            return false;
        }

        std::unique_lock<std::mutex> second(mutex2, std::try_to_lock);
        if (!second) {
            return false;
        }

        perform_work();
        return true;
    }

    // Recovery hook. Other options would be forcibly releasing resources
    // (dangerous, may break consistency), rolling the operation back, or
    // alerting an operator / logging a severe error; this example degrades
    // gracefully instead.
    void recover_from_deadlock() {
        std::cout << "Attempting deadlock recovery..." << std::endl;
        perform_graceful_degradation();
    }

    void perform_work() {
        std::cout << "Performing normal work..." << std::endl;
    }

    // Placeholder for work that does not need both mutexes.
    void perform_graceful_degradation() {
        std::cout << "Performing degraded operation..." << std::endl;
    }
};

5.2 事务式执行与回滚（借鉴事务性内存的思想）

#include <vector>
#include <functional>

// All-or-nothing execution of a list of operations.
//
// Each operation is registered together with the action that undoes it.
// execute() runs the operations in registration order; if one throws, the
// undo actions of the operations that already completed are run in reverse
// order.
class TransactionalMemory {
    using OperationList = std::vector<std::function<void()>>;

    // operations[i] is undone by rollback_ops[i]; the two lists always have
    // the same length.
    OperationList operations;
    OperationList rollback_ops;
    
public:
    // Registers `op` and the `rollback` that reverses its effect.
    // If storing the rollback fails, the operation is removed again so the
    // two lists stay aligned, and the exception is rethrown.
    template<typename Operation, typename Rollback>
    void add_operation(Operation&& op, Rollback&& rollback) {
        operations.emplace_back(std::forward<Operation>(op));
        try {
            rollback_ops.emplace_back(std::forward<Rollback>(rollback));
        } catch (...) {
            operations.pop_back();
            throw;
        }
    }
    
    // Runs every registered operation in order.
    // Returns true when all of them completed. If an operation throws, the
    // operations completed so far are rolled back (the failing one is not)
    // and false is returned; the exception itself is swallowed.
    bool execute() {
        OperationList::size_type completed = 0;
        
        for (auto& op : operations) {
            try {
                op();
            } catch (...) {
                std::cout << "Operation failed, rolling back..." << std::endl;
                rollback_all(completed);
                return false;
            }
            ++completed;
        }
        
        return true;
    }
    
private:
    // Undoes the first `completed` operations, last one first. Operations are
    // identified by position: std::function objects cannot be compared with
    // each other, so searching for them by value is not possible.
    // A rollback that throws is reported and the remaining ones still run.
    void rollback_all(OperationList::size_type completed) {
        while (completed > 0) {
            --completed;
            try {
                rollback_ops[completed]();
            } catch (...) {
                std::cerr << "Rollback operation failed" << std::endl;
            }
        }
    }
};

6. 最佳实践总结

6.1 设计原则

// Collection of small examples, one per design principle for avoiding
// deadlocks.
class DeadlockFreeDesign {
public:
    // Principle 1: always acquire locks in the same order.
    void consistent_lock_order() {
        // Static so that every caller contends on the same three mutexes.
        static std::mutex mutex_a, mutex_b, mutex_c;
        
        // Always A -> B -> C
        std::lock_guard<std::mutex> lock_a(mutex_a);
        std::lock_guard<std::mutex> lock_b(mutex_b);
        std::lock_guard<std::mutex> lock_c(mutex_c);
        
        // safe work
    }
    
    // Principle 2: manage locks with RAII.
    // Locks the mutex on construction; the std::unique_lock member releases
    // it when the manager is destroyed.
    class RAIILockManager {
        std::unique_lock<std::mutex> lock;
        
    public:
        explicit RAIILockManager(std::mutex& m) : lock(m) {}
    };
    
    // Principle 3: do not call unknown code while holding a lock.
    void avoid_unknown_calls_while_locked() {
        // Static (as in consistent_lock_order) so that concurrent callers
        // share one mutex; an automatic mutex would be private to each call
        // and could never exclude anyone.
        static std::mutex resource_mutex;
        
        {
            std::lock_guard<std::mutex> lock(resource_mutex);
            // Only operations known to be safe
            known_safe_operation();
            
            // Do not call functions that might take other locks
            // potentially_dangerous_operation();  // avoid!
        }
        
        // The lock has been released; arbitrary code may be called now
        potentially_dangerous_operation();
    }
    
    // Principle 4: use a lock hierarchy.
    void use_lock_hierarchy() {
        // Hierarchical-mutex pattern; intentionally left empty here.
    }
    
    // Principle 5: prefer lock-free data structures.
    void prefer_lock_free() {
        std::atomic<int> counter{0};
        LockFreeQueue<int> queue;
        
        // Lock-free operations cannot deadlock
        counter.fetch_add(1);
        queue.push(42);
    }

private:
    void known_safe_operation() {}
    void potentially_dangerous_operation() {}
};

6.2 代码审查清单

// Prints the deadlock-prevention review checklist to standard output.
class DeadlockPreventionChecklist {
public:
    // Writes the heading followed by the eight checklist questions, one per
    // line, flushing after each line.
    static void review_code() {
        static const char* const kChecklistLines[] = {
            "Deadlock Prevention Checklist:",
            "1. 所有锁是否按照固定顺序获取？",
            "2. 是否避免了嵌套锁？",
            "3. 是否使用RAII管理锁？",
            "4. 锁的持有时间是否尽可能短？",
            "5. 是否避免了在持有锁时调用虚函数？",
            "6. 是否避免了在持有锁时分配内存？",
            "7. 是否考虑了使用无锁数据结构？",
            "8. 是否设置了合理的锁超时时间？",
        };

        for (const char* line : kChecklistLines) {
            std::cout << line << std::endl;
        }
    }
};

7. 测试死锁

7.1 死锁测试框架

#include <gtest/gtest.h>

// GoogleTest fixture providing a stress harness for deadlock-prone systems.
class DeadlockTest : public ::testing::Test {
protected:
    // Hammers `system` from `num_threads` worker threads for `duration`.
    // Even-numbered workers call system.operation_a(), odd-numbered workers
    // call system.operation_b(), pausing 1 ms between calls.
    //
    // After `duration` the workers are told to stop and get `join_timeout` to
    // finish. Workers that are still running after that are presumed to be
    // deadlocked: joining them would block forever and they cannot be
    // cancelled, so a test failure is recorded and the process is aborted.
    // Otherwise the test fails if no operation completed at all.
    //
    // SystemUnderTest must provide operation_a() and operation_b().
    // Exceptions derived from std::exception thrown by an operation are
    // printed and counted; they do not stop the worker.
    template<typename SystemUnderTest>
    void stress_test_deadlock(SystemUnderTest& system, int num_threads, 
                             std::chrono::seconds duration,
                             std::chrono::seconds join_timeout = std::chrono::seconds(5)) {
        std::atomic<bool> stop{false};
        std::vector<std::thread> threads;
        std::atomic<int> completed_operations{0};
        std::atomic<int> deadlock_count{0};
        // Number of workers that have left their loop; lets the main thread
        // notice stuck workers without calling join() on them.
        std::atomic<int> finished_workers{0};
        
        const auto start_time = std::chrono::steady_clock::now();
        
        for (int i = 0; i < num_threads; ++i) {
            threads.emplace_back([&, i] {
                while (!stop) {
                    if (std::chrono::steady_clock::now() - start_time > duration) {
                        break;
                    }
                    
                    try {
                        if (i % 2 == 0) {
                            system.operation_a();
                        } else {
                            system.operation_b();
                        }
                        completed_operations++;
                    } catch (const std::exception& e) {
                        std::cerr << "Exception: " << e.what() << std::endl;
                        deadlock_count++;
                    }
                    
                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                }
                finished_workers++;
            });
        }
        
        std::this_thread::sleep_for(duration);
        stop = true;
        
        // Watchdog: poll instead of joining so that stuck workers are detected.
        const auto join_deadline = std::chrono::steady_clock::now() + join_timeout;
        while (finished_workers.load() < num_threads &&
               std::chrono::steady_clock::now() < join_deadline) {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
        }
        
        if (finished_workers.load() < num_threads) {
            ADD_FAILURE() << "System appears deadlocked! "
                          << (num_threads - finished_workers.load())
                          << " worker thread(s) did not finish; completed operations: "
                          << completed_operations.load();
            // The stuck threads still reference this frame and `system`, so
            // returning (or destroying the std::thread objects) is not an
            // option; terminate the test process instead.
            std::abort();
        }
        
        for (auto& t : threads) {
            if (t.joinable()) t.join();
        }
        
        std::cout << "Completed operations: " << completed_operations.load() << std::endl;
        std::cout << "Potential deadlocks: " << deadlock_count.load() << std::endl;
        
        ASSERT_GT(completed_operations.load(), 0) << "System appears deadlocked!";
    }
};

// Stress-tests BasicDeadlock with four threads for five seconds.
// BasicDeadlock takes its two mutexes in opposite orders on purpose, so this
// test exists to demonstrate the failure mode: the workers are expected to
// deadlock, and the test is not expected to pass.
TEST_F(DeadlockTest, BasicSystemTest) {
    // stress_test_deadlock() drives the system through operation_a() and
    // operation_b(); BasicDeadlock names its entry points differently, so map
    // one onto the other.
    struct BasicDeadlockAdapter {
        BasicDeadlock target;
        void operation_a() { target.thread1_work(); }
        void operation_b() { target.thread2_work(); }
    };

    BasicDeadlockAdapter system;
    stress_test_deadlock(system, 4, std::chrono::seconds(5));
}