死锁存在的条件
如果在一个系统中以下四个条件同时成立,那么就能引起死锁:
- 互斥:至少有一个资源必须处于非共享模式,即一次只有一个进程可使用。如果另一进程申请该资源,那么申请进程应等到该资源释放为止。
- 占有并等待:一个进程应占有至少一个资源,并等待另一个资源,而该资源为其他进程所占有。
- 非抢占:资源不能被抢占,即资源只能被进程在完成任务后自愿释放。
- 循环等待:有一组等待进程 {P0,P1,…,Pn},P0 等待的资源为 P1 占有,P1 等待的资源为 P2 占有,……,Pn-1 等待的资源为 Pn 占有,Pn 等待的资源为 P0 占有。
预防死锁
最好的方法:规定所有的锁只能按一定的顺序被获取 (破坏循环等待条件)
其他方法:
- 同时获取所有资源(效率低,要预先知道需要多少资源)
- 在获取锁失败时不进行等待,而是释放已占有的资源(局限性较大)
避免死锁
银行家算法(需要知道每个线程需要的最大资源)
死锁检测
分配图
有两种节点:P线程节点,R资源节点
每次线程P向资源R申请资源,如果R没有资源,则添加一条等待边(P->R)
当P得到资源R时,删除等待边(如果有),并立即添加一条分配边(R->P)
当释放资源时,删除分配边
注意操作图的时候要加锁
当且仅当分配图有环时,存在死锁(前提是每个资源只有一个实例;若资源有多个实例,有环只是死锁的必要条件)
代码实现
通过dlsym在pthread_mutex_lock和pthread_spin_lock(以及对应的unlock)上挂钩子,在加锁解锁前后维护分配图
新建一个监控线程,每隔五秒检测图里是否有环(这里假设每个资源最多只有一个owner)
具体代码就不解释了吧
整个图需要的数据结构都保存在全局变量checker中
其中jshashmap(哈希表)和jsgraph(图)是我自己手写的简单数据结构,这里为了简洁就省略了
#define _GNU_SOURCE
#include <dlfcn.h>
#include <errno.h>
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include "jshashmap.h"
#include "jsgraph.h"
/* Not referenced by any code visible in this file. */
#define THREAD_NUM 10
/* Integer type used to carry thread ids and lock addresses through the checker. */
typedef unsigned long int uint64;
/*
 * Pointers to the next definitions of the hooked pthread functions.
 * init_hook() fills them in with dlsym(RTLD_NEXT, ...); the wrappers and the
 * checker call through them so they never re-enter the hooks.
 */
typedef int (*pthread_mutex_lock_t)(pthread_mutex_t *mutex);
pthread_mutex_lock_t pthread_mutex_lock_f;
typedef int (*pthread_mutex_unlock_t)(pthread_mutex_t *mutex);
pthread_mutex_unlock_t pthread_mutex_unlock_f;
typedef int (*pthread_spin_lock_t)(pthread_spinlock_t *spin);
pthread_spin_lock_t pthread_spin_lock_f;
typedef int (*pthread_spin_unlock_t)(pthread_spinlock_t *spin);
pthread_spin_unlock_t pthread_spin_unlock_f;
/*
 * Vertex id layout of the waiting graph: thread vertices are allocated from
 * THREAD_VERTEX_START upwards, lock vertices from RESOURCE_VERTEX_START
 * upwards. print_cycle() treats any id below THREAD_VERTEX_MAX as a thread.
 * JS_GSIZE is presumably the graph capacity declared in jsgraph.h - TODO confirm.
 */
#define THREAD_VERTEX_START 0
#define THREAD_VERTEX_MAX (JS_GSIZE >> 1)
#define RESOURCE_VERTEX_START THREAD_VERTEX_MAX
#define RESOURCE_VERTEX_MAX JS_GSIZE
/* Capacity of the path array used while walking the graph in check_dead_lock(). */
#define WAITING_PATH_MAX 512
/*
 * Global state of the deadlock checker: the waiting graph plus the lookup
 * table and counters used to assign vertex ids to threads and locks.
 */
struct {
pthread_spinlock_t lock; // held by lock_before/lock_after/unlock_after while they touch the fields below
//js_hashmap_t lock_owner; //lock addr => thread vertex_id
js_hashmap_t addr_vertex; //thread addr or lock addr => vertex_id
int next_thread_vid; //next thread vid to allocate (starts at THREAD_VERTEX_START)
int next_lock_vid; //next lock vid to allocate (starts at RESOURCE_VERTEX_START)
js_graph_t waiting_graph; //edges: thread => lock (waiting), lock => thread (owned); data: vertex_id => thread id or lock addr
} checker;
/*
 * Print one cycle found in the waiting graph to stdout.
 *
 * path  - vertex ids of the current walk, in visiting order
 * start - index in path of the first vertex that belongs to the cycle
 * end   - one past the index of the last vertex that belongs to the cycle
 *
 * Every vertex in path[start..end-1] is printed, followed by path[start]
 * again so the printed chain closes on the vertex it began with. A vertex is
 * tagged 'T' (thread) when its id is below THREAD_VERTEX_MAX, 'L' (lock)
 * otherwise. "idx" is the vertex id, "id" the thread id or lock address
 * stored for that vertex in checker.waiting_graph.data.
 */
void print_cycle(int *path, int start, int end) {
    int i;
    int vid;
    int first;

    /* An empty range has no vertex to print or to close the cycle with. */
    if (start >= end) {
        return;
    }

    printf("cycle detected: ");
    for (i = start; i < end; i++) {
        vid = path[i];
        printf("[%c idx=%d id=%lu] -> ",
               (vid < THREAD_VERTEX_MAX) ? 'T' : 'L',
               vid,
               (unsigned long)(uintptr_t)checker.waiting_graph.data[vid]);
    }

    /*
     * Close the loop with the cycle's first vertex. The previous code printed
     * the path index instead of the vertex id, always tagged it 'T', and
     * showed the data of the last vertex.
     */
    first = path[start];
    printf("[%c idx=%d id=%lu]\n",
           (first < THREAD_VERTEX_MAX) ? 'T' : 'L',
           first,
           (unsigned long)(uintptr_t)checker.waiting_graph.data[first]);
}
/*
 * Walk the waiting graph starting at vertex cur and report a cycle when the
 * walk comes back to a vertex that is already on the current path.
 *
 * visited - per-vertex flags; a successor whose flag is set is not followed
 * path    - vertices of the walk so far (capacity WAITING_PATH_MAX)
 * npath   - number of valid entries in path
 * cur     - vertex to examine
 *
 * Only the first entry of cur's adjacency list is followed.
 */
void check_dead_lock_dfs(int *visited, int *path, int npath, int cur) {
    int pos = 0;
    int succ;

    /* cur already on the path: path[pos..npath-1] forms a cycle. */
    while (pos < npath) {
        if (path[pos] == cur) {
            print_cycle(path, pos, npath);
            return;
        }
        pos++;
    }

    /* No outgoing edge: the walk ends here. */
    if (!checker.waiting_graph.adj[cur]) {
        return;
    }

    succ = checker.waiting_graph.adj[cur]->vertex_id;
    if (visited[succ]) {
        return;
    }

    path[npath] = cur;
    npath += 1;
    if (npath == WAITING_PATH_MAX) {
        printf("waiting path too long\n");
        return;
    }

    check_dead_lock_dfs(visited, path, npath, succ);
    /* Marked only after the recursive walk, so the walk itself can still return to it. */
    visited[succ] = 1;
}
/*
 * Dump the waiting graph and search it for cycles, printing each one found.
 *
 * The scratch arrays are static, so this function is not reentrant; in this
 * file it is only called from the monitor thread.
 *
 * checker.lock is held for the whole pass (taken through the real spin-lock
 * pointers, so the hooks are not re-entered): the hooked threads add and
 * erase edges under that same lock, and walking the graph while they do so
 * would read it mid-update.
 */
void check_dead_lock(void) {
    static int visited[JS_GSIZE];
    static int path[WAITING_PATH_MAX];
    int i;

    pthread_spin_lock_f(&checker.lock);

    js_graph_print(&checker.waiting_graph);

    /*
     * Size each memset from the array itself. The old code cleared path with
     * sizeof(int) * JS_GSIZE, which writes past the end of path whenever
     * JS_GSIZE is larger than WAITING_PATH_MAX.
     */
    memset(visited, 0, sizeof visited);
    memset(path, 0, sizeof path);

    for (i = 0; i < JS_GSIZE; i++) {
        if (!visited[i]) {
            check_dead_lock_dfs(visited, path, 0, i);
            visited[i] = 1;
        }
    }

    pthread_spin_unlock_f(&checker.lock);
}
/*
 * Body of the monitor thread: every five seconds run one deadlock check.
 * Never returns; args is unused.
 */
static void *thread_routine(void *args) {
    for (;;) {
        sleep(5);
        check_dead_lock();
    }
}
/*
 * Initialise the global checker: vertex-id counters at the start of their
 * ranges, then the spin lock, the address => vertex map and the waiting graph.
 */
void init_checker() {
    /* Thread and lock vertices are allocated from separate ranges. */
    checker.next_thread_vid = THREAD_VERTEX_START;
    checker.next_lock_vid = RESOURCE_VERTEX_START;

    pthread_spin_init(&checker.lock, PTHREAD_PROCESS_PRIVATE);
    js_hashmap_init(&checker.addr_vertex);
    js_graph_init(&checker.waiting_graph, 0);
}
/*
 * Start the background monitor thread that runs thread_routine().
 *
 * If the thread cannot be created, the reason is written to stderr and the
 * function returns; the program then runs without deadlock monitoring. The
 * thread is detached because nothing ever joins it.
 */
void start_check(void) {
    pthread_t tid;
    int rc = pthread_create(&tid, NULL, thread_routine, NULL);

    if (rc != 0) {
        /* pthread_create returns the error number instead of setting errno. */
        fprintf(stderr, "start_check: pthread_create failed: %s\n", strerror(rc));
        return;
    }
    pthread_detach(tid);
}
/*
 * Return the vertex id for thread_id, allocating the next thread vertex and
 * recording thread_id as that vertex's data on first sight.
 *
 * NOT thread safe: reads and updates checker without taking checker.lock.
 */
int checker_get_thread_vid(uint64 thread_id) {
    js_hashmap_node_t *entry = js_hashmap_get(&checker.addr_vertex, thread_id);

    /* Known thread: reuse the id stored in the map. */
    if (entry) {
        return (int)entry->v;
    }

    int fresh = checker.next_thread_vid++;
    js_hashmap_put(&checker.addr_vertex, thread_id, fresh);
    checker.waiting_graph.data[fresh] = (void*)thread_id;
    return fresh;
}
/*
 * Return the vertex id for the lock at lockaddr, allocating the next lock
 * vertex and recording lockaddr as that vertex's data on first sight.
 *
 * NOT thread safe: reads and updates checker without taking checker.lock.
 */
int checker_get_lock_vid(uint64 lockaddr) {
    js_hashmap_node_t *entry = js_hashmap_get(&checker.addr_vertex, lockaddr);

    /* Known lock: reuse the id stored in the map. */
    if (entry) {
        return (int)entry->v;
    }

    int fresh = checker.next_lock_vid++;
    js_hashmap_put(&checker.addr_vertex, lockaddr, fresh);
    checker.waiting_graph.data[fresh] = (void*)lockaddr;
    return fresh;
}
/*
 * Record that thread_id is about to request the lock at lockaddr by adding a
 * thread => lock edge to the waiting graph.
 *
 * Thread safe: the update runs under checker.lock, taken through the real
 * spin-lock pointers.
 */
void lock_before(uint64 thread_id, uint64 lockaddr) {
    int thread_v;
    int lock_v;

    pthread_spin_lock_f(&checker.lock);
    thread_v = checker_get_thread_vid(thread_id);
    lock_v = checker_get_lock_vid(lockaddr);
    /* thread => lock */
    js_graph_add_edge(&checker.waiting_graph, thread_v, lock_v);
    pthread_spin_unlock_f(&checker.lock);
}
/*
 * Record that thread_id has just acquired the lock at lockaddr: the waiting
 * edge thread => lock is replaced by the ownership edge lock => thread.
 * Runs under checker.lock.
 */
void lock_after(uint64 thread_id, uint64 lockaddr) {
    int thread_v;
    int lock_v;

    pthread_spin_lock_f(&checker.lock);
    thread_v = checker_get_thread_vid(thread_id);
    lock_v = checker_get_lock_vid(lockaddr);
    /* Flip the edge direction: no longer waiting, now owning. */
    js_graph_erase_edge(&checker.waiting_graph, thread_v, lock_v);
    js_graph_add_edge(&checker.waiting_graph, lock_v, thread_v);
    pthread_spin_unlock_f(&checker.lock);
}
/*
 * Record that thread_id has released the lock at lockaddr by erasing the
 * ownership edge lock => thread from the waiting graph.
 * Runs under checker.lock.
 */
void unlock_after(uint64 thread_id, uint64 lockaddr) {
    int thread_v;
    int lock_v;

    pthread_spin_lock_f(&checker.lock);
    thread_v = checker_get_thread_vid(thread_id);
    lock_v = checker_get_lock_vid(lockaddr);
    /* The thread no longer owns the lock. */
    js_graph_erase_edge(&checker.waiting_graph, lock_v, thread_v);
    pthread_spin_unlock_f(&checker.lock);
}
int pthread_mutex_lock(pthread_mutex_t *mutex) {
pthread_t selfid = pthread_self(); //
lock_before(selfid, (uint64)mutex);
pthread_mutex_lock_f(mutex);
lock_after(selfid, (uint64)mutex);
}
/*
 * Hook for pthread_mutex_unlock: performs the real unlock and, when it
 * succeeded, removes the lock => thread ownership edge from the waiting
 * graph. A failed unlock leaves the graph untouched.
 *
 * Returns whatever the real pthread_mutex_unlock returned (the previous
 * version had no return statement).
 *
 * Requires init_hook() and init_checker() to have run.
 */
int pthread_mutex_unlock(pthread_mutex_t *mutex) {
    pthread_t selfid = pthread_self();
    int rc = pthread_mutex_unlock_f(mutex);

    if (rc == 0) {
        unlock_after(selfid, (uint64)mutex);
    }
    return rc;
}
/*
 * Hook for pthread_spin_lock: keeps the waiting graph in step with the
 * real lock operation.
 *
 * A thread => lock edge is added before the real call. On success the edge
 * is flipped to lock => thread; on failure the waiting edge is removed
 * again so a failed attempt leaves no trace in the graph.
 *
 * Returns whatever the real pthread_spin_lock returned (the previous
 * version had no return statement).
 *
 * Requires init_hook() and init_checker() to have run.
 */
int pthread_spin_lock(pthread_spinlock_t *spin) {
    uint64 self = (uint64)pthread_self();
    uint64 addr = (uint64)spin;
    int rc;

    lock_before(self, addr);
    rc = pthread_spin_lock_f(spin);

    if (rc == 0) {
        lock_after(self, addr);
        return rc;
    }

    /* Not acquired: withdraw the waiting edge added by lock_before(). */
    pthread_spin_lock_f(&checker.lock);
    {
        int t_vid = checker_get_thread_vid(self);
        int l_vid = checker_get_lock_vid(addr);
        js_graph_erase_edge(&checker.waiting_graph, t_vid, l_vid);
    }
    pthread_spin_unlock_f(&checker.lock);
    return rc;
}
/*
 * Hook for pthread_spin_unlock: performs the real unlock and, when it
 * succeeded, removes the lock => thread ownership edge from the waiting
 * graph. A failed unlock leaves the graph untouched.
 *
 * Returns whatever the real pthread_spin_unlock returned (the previous
 * version had no return statement).
 *
 * Requires init_hook() and init_checker() to have run.
 */
int pthread_spin_unlock(pthread_spinlock_t *spin) {
    pthread_t selfid = pthread_self();
    int rc = pthread_spin_unlock_f(spin);

    if (rc == 0) {
        unlock_after(selfid, (uint64)spin);
    }
    return rc;
}
/*
 * Resolve the next definitions of the four hooked pthread functions with
 * dlsym(RTLD_NEXT, ...) and store them in the pthread_*_f pointers.
 *
 * Must run before any hooked function is called, because the hooks call
 * through these pointers unconditionally.
 *
 * Returns 0 when all four symbols were found, -1 otherwise (a diagnostic is
 * written to stderr). The previous version was declared int but returned
 * nothing.
 */
static int init_hook() {
    pthread_mutex_lock_f = dlsym(RTLD_NEXT, "pthread_mutex_lock");
    pthread_mutex_unlock_f = dlsym(RTLD_NEXT, "pthread_mutex_unlock");
    pthread_spin_lock_f = dlsym(RTLD_NEXT, "pthread_spin_lock");
    pthread_spin_unlock_f = dlsym(RTLD_NEXT, "pthread_spin_unlock");

    if (!pthread_mutex_lock_f || !pthread_mutex_unlock_f ||
        !pthread_spin_lock_f || !pthread_spin_unlock_f) {
        /* dlerror() may itself return NULL, so never pass it straight to %s. */
        const char *reason = dlerror();
        fprintf(stderr, "init_hook: failed to resolve pthread symbols: %s\n",
                reason ? reason : "symbol not found");
        return -1;
    }
    return 0;
}
测试代码
/* Mutexes taken pairwise by thread_rountine_1..4 in the order 1->2, 2->3, 3->4, 4->1. */
pthread_mutex_t mutex_1 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex_2 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex_3 = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutex_4 = PTHREAD_MUTEX_INITIALIZER;
/* Spin locks taken in opposite orders by thread_rountine_5 and thread_rountine_6; initialised in main(). */
pthread_spinlock_t spin_1;
pthread_spinlock_t spin_2;
/*
 * Test worker 1: locks mutex_1, sleeps one second, then locks mutex_2 before
 * releasing both. The sleep gives the other workers time to take their first
 * mutex, so workers 1-4 end up waiting on each other in a ring.
 *
 * args is unused; always returns NULL.
 */
void *thread_rountine_1(void *args)
{
    (void)args;
    /* pthread_t is printed through an explicit cast so the format matches the argument type. */
    printf("thread_routine 1 : %lu \n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mutex_1);
    sleep(1);
    pthread_mutex_lock(&mutex_2);
    pthread_mutex_unlock(&mutex_2);
    pthread_mutex_unlock(&mutex_1);
    return NULL;
}
/*
 * Test worker 2: locks mutex_2, sleeps one second, then locks mutex_3 before
 * releasing both. The sleep gives the other workers time to take their first
 * mutex, so workers 1-4 end up waiting on each other in a ring.
 *
 * args is unused; always returns NULL.
 */
void *thread_rountine_2(void *args)
{
    (void)args;
    /* pthread_t is printed through an explicit cast so the format matches the argument type. */
    printf("thread_routine 2 : %lu \n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mutex_2);
    sleep(1);
    pthread_mutex_lock(&mutex_3);
    pthread_mutex_unlock(&mutex_3);
    pthread_mutex_unlock(&mutex_2);
    return NULL;
}
/*
 * Test worker 3: locks mutex_3, sleeps one second, then locks mutex_4 before
 * releasing both. The sleep gives the other workers time to take their first
 * mutex, so workers 1-4 end up waiting on each other in a ring.
 *
 * args is unused; always returns NULL.
 */
void *thread_rountine_3(void *args)
{
    (void)args;
    /* pthread_t is printed through an explicit cast so the format matches the argument type. */
    printf("thread_routine 3 : %lu \n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mutex_3);
    sleep(1);
    pthread_mutex_lock(&mutex_4);
    pthread_mutex_unlock(&mutex_4);
    pthread_mutex_unlock(&mutex_3);
    return NULL;
}
/*
 * Test worker 4: locks mutex_4, sleeps one second, then locks mutex_1 before
 * releasing both. This closes the ring started by workers 1-3
 * (mutex_1 -> mutex_2 -> mutex_3 -> mutex_4 -> mutex_1).
 *
 * args is unused; always returns NULL.
 */
void *thread_rountine_4(void *args)
{
    (void)args;
    /* pthread_t is printed through an explicit cast so the format matches the argument type. */
    printf("thread_routine 4 : %lu \n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mutex_4);
    sleep(1);
    pthread_mutex_lock(&mutex_1);
    pthread_mutex_unlock(&mutex_1);
    pthread_mutex_unlock(&mutex_4);
    return NULL;
}
/*
 * Test worker 5: takes spin_1, sleeps one second, then takes spin_2 before
 * releasing both. thread_rountine_6 takes the same two spin locks in the
 * opposite order, so the pair can block each other.
 *
 * args is unused; always returns NULL.
 */
void *thread_rountine_5(void *args)
{
    (void)args;
    /* pthread_t is printed through an explicit cast so the format matches the argument type. */
    printf("thread_routine 5 : %lu \n", (unsigned long)pthread_self());
    pthread_spin_lock(&spin_1);
    printf("thread_routine 5 get spin 1\n");
    sleep(1);
    pthread_spin_lock(&spin_2);
    printf("thread_routine 5 get spin 2\n");
    pthread_spin_unlock(&spin_2);
    pthread_spin_unlock(&spin_1);
    return NULL;
}
/*
 * Test worker 6: takes spin_2, sleeps one second, then takes spin_1 before
 * releasing both. thread_rountine_5 takes the same two spin locks in the
 * opposite order, so the pair can block each other.
 *
 * args is unused; always returns NULL.
 */
void *thread_rountine_6(void *args)
{
    (void)args;
    /* pthread_t is printed through an explicit cast so the format matches the argument type. */
    printf("thread_routine 6 : %lu \n", (unsigned long)pthread_self());
    pthread_spin_lock(&spin_2);
    printf("thread_routine 6 get spin 2\n");
    sleep(1);
    pthread_spin_lock(&spin_1);
    printf("thread_routine 6 get spin 1\n");
    pthread_spin_unlock(&spin_1);
    pthread_spin_unlock(&spin_2);
    return NULL;
}
/*
 * Test driver: installs the hooks, initialises the checker, starts the
 * monitor thread, then launches the six test workers in order and joins
 * them in the same order.
 */
int main()
{
    /* Workers in launch order. */
    void *(*const workers[])(void *) = {
        thread_rountine_1, thread_rountine_2, thread_rountine_3,
        thread_rountine_4, thread_rountine_5, thread_rountine_6,
    };
    enum { WORKER_COUNT = sizeof workers / sizeof workers[0] };
    pthread_t tids[WORKER_COUNT];
    int k;

    init_hook();
    init_checker();
    start_check();
    printf("start_check\n");

    pthread_spin_init(&spin_1, PTHREAD_PROCESS_PRIVATE);
    pthread_spin_init(&spin_2, PTHREAD_PROCESS_PRIVATE);

    for (k = 0; k < WORKER_COUNT; k++) {
        pthread_create(&tids[k], NULL, workers[k], NULL);
    }
    for (k = 0; k < WORKER_COUNT; k++) {
        pthread_join(tids[k], NULL);
    }
    return 0;
}
修复死锁
死锁是逻辑bug,完全修复死锁需要修改业务代码
临时补救方法(选一个,从简单到难):
- 把所有陷入死锁的线程都干掉
- 把所有陷入死锁的线程恢复到之前的一个checkpoint
- 把死锁进程一个个干掉直到不再死锁
- 把死锁进程的资源一个个释放直到不再死锁
参考资料
- 零声教育c/c++linux后台开发3.2.4
- https://blog.youkuaiyun.com/zhangpower1993/article/details/89518780#:~:text=%E6%AD%BB%E9%94%81%E9%81%BF%E5%85%8D%E6%98%AF%E5%88%A9%E7%94%A8,%E9%94%81%EF%BC%8C%E5%88%99%E6%8B%92%E7%BB%9D%E8%AF%A5%E7%94%B3%E8%AF%B7%E3%80%82
- http://c.biancheng.net/view/1236.html