1. Problem Description
On an Inspur ARM64 server running the Kylin OS, using our company's GPU and driver, the desktop froze after x11perf and glmark2 had been running for a while.
2. Problem Analysis
After connecting to the machine remotely, top showed Xorg at 100% CPU, which suggested that Xorg was stuck in an infinite loop. Attaching gdb to the Xorg process and running bt produced the following backtrace:
(gdb) bt
#0 0x0000007fa443817c in HashFind ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libdrm.so.2
#1 0x0000007fa443834c in drmHashLookup ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libdrm.so.2
#2 0x0000007fa30fa478 in xdxgpu_lookup_bo ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libdrm_xdxgpu.so.1
#3 0x0000007fa30f9d68 in xdxgpu_bo_import ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libdrm_xdxgpu.so.1
#4 0x0000007fa3280608 in ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libgsl_xdxgpu.so
#5 0x0000007fa3281acc in ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libgsl_xdxgpu.so
#6 0x0000007fa3282ae8 in ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libgsl_xdxgpu.so
#7 0x0000007fa32ae440 in PVRSRVAcquireCPUMapping ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libgsl_xdxgpu.so
#8 0x0000007fa32a83e4 in PVRSRVAcquireCPUMappingMIW ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libgsl_xdxgpu.so
#9 0x0000007fa2ff8e60 in ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libGLESv2_xdxgpu.so
#10 0x0000007fa2ff9db4 in glTexImage2D ()
at /usr/lib/aarch64-linux-gnu/xdxgpu/libGLESv2_xdxgpu.so
#11 0x0000007fa37fdabc in () at /usr/lib/xorg/modules/drivers/xdxgpu_drv.so
#12 0x0000007fa38090cc in () at /usr/lib/xorg/modules/drivers/xdxgpu_drv.so
#13 0x0000007fa380d038 in () at /usr/lib/xorg/modules/drivers/xdxgpu_drv.so
#14 0x0000007fa380f5e4 in () at /usr/lib/xorg/modules/drivers/xdxgpu_drv.so
#15 0x000000558e4e8200 in ()
#16 0x0000007fa37faf90 in () at /usr/lib/xorg/modules/drivers/xdxgpu_drv.so
#17 0x0000007fa37fb03c in () at /usr/lib/xorg/modules/drivers/xdxgpu_drv.so
#18 0x000000558e4dde34 in ()
#19 0x000000558e40aef0 in ()
#20 0x000000558e40efdc in ()
#21 0x0000007fa3ffb110 in __libc_start_main ()
at /lib/aarch64-linux-gnu/libc.so.6
#22 0x000000558e3f89a0 in _start ()
Because the UMD is a release build, source-level single stepping is not possible, and the problem is extremely hard to reproduce (it has occurred only this once), so the issue was debugged by analyzing the assembly code.
The backtrace shows that execution is inside HashFind. Resuming and interrupting the process several times (continue, wait a while, press Ctrl+C, then bt) always landed in the same function.
The source of HashFind in the project is shown below; it looks up an element in the hash table:
typedef struct HashBucket {
    unsigned long key;
    void *value;
    struct HashBucket *next;
} HashBucket, *HashBucketPtr;

static HashBucketPtr HashFind(HashTablePtr table,
                              unsigned long key, unsigned long *h)
{
    unsigned long hash = HashHash(key);
    HashBucketPtr prev = NULL;
    HashBucketPtr bucket;

    if (h) *h = hash;

    for (bucket = table->buckets[hash]; bucket; bucket = bucket->next) {
        if (bucket->key == key) {
            if (prev) {
                /* Organize */
                prev->next = bucket->next;
                bucket->next = table->buckets[hash];
                table->buckets[hash] = bucket;
                ++table->partials;
            } else {
                ++table->hits;
            }
            return bucket;
        }
        prev = bucket;
    }
    ++table->misses;
    return NULL;
}
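A quick aside that the assembly analysis below relies on: with the LP64 data model used on AArch64 Linux, key, value, and next sit at offsets 0, 8, and 16 inside HashBucket. This can be confirmed with a small compile-time check (an illustrative sketch added here, not part of the libdrm sources):

#include <assert.h>   /* static_assert (C11) */
#include <stddef.h>   /* offsetof */

typedef struct HashBucket {
    unsigned long key;
    void *value;
    struct HashBucket *next;
} HashBucket;

/* On LP64 (AArch64 Linux), unsigned long and pointers are both 8 bytes wide. */
static_assert(offsetof(HashBucket, key)   == 0,  "key is the first member");
static_assert(offsetof(HashBucket, value) == 8,  "value follows the 8-byte key");
static_assert(offsetof(HashBucket, next)  == 16, "next is 16 bytes in, matching ldr x2, [x0, #16]");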
Disassembling the function gives the following assembly for HashFind:
(gdb) disassemble
Dump of assembler code for function HashFind:
0x0000007fa4438100 <+0>: stp x29, x30, [sp, #-80]!
0x0000007fa4438104 <+4>: mov x29, sp
0x0000007fa4438108 <+8>: stp x23, x24, [sp, #48]
0x0000007fa443810c <+12>: adrp x23, 0x7fa4451000 <memcpy@got.plt>
0x0000007fa4438110 <+16>: ldr w3, [x23, #1312]
0x0000007fa4438114 <+20>: stp x19, x20, [sp, #16]
0x0000007fa4438118 <+24>: mov x20, x1
0x0000007fa443811c <+28>: stp x21, x22, [sp, #32]
0x0000007fa4438120 <+32>: mov x21, x0
0x0000007fa4438124 <+36>: mov x22, x2
0x0000007fa4438128 <+40>: cbz w3, 0x7fa44381ec <HashFind+236>
0x0000007fa443812c <+44>: cbz x20, 0x7fa4438238 <HashFind+312>
0x0000007fa4438130 <+48>: add x1, x23, #0x520
0x0000007fa4438134 <+52>: mov x3, x20
0x0000007fa4438138 <+56>: add x1, x1, #0x10
0x0000007fa443813c <+60>: mov x4, #0x0 // #0
0x0000007fa4438140 <+64>: and x0, x3, #0xff
0x0000007fa4438144 <+68>: lsr x3, x3, #8
0x0000007fa4438148 <+72>: ldr x0, [x1, x0, lsl #3]
0x0000007fa443814c <+76>: add x4, x0, x4, lsl #1
0x0000007fa4438150 <+80>: cbnz x3, 0x7fa4438140 <HashFind+64>
0x0000007fa4438154 <+84>: and x4, x4, #0x1ff
0x0000007fa4438158 <+88>: cbz x22, 0x7fa4438160 <HashFind+96>
0x0000007fa443815c <+92>: str x4, [x22]
0x0000007fa4438160 <+96>: add x4, x21, x4, lsl #3
0x0000007fa4438164 <+100>: ldr x5, [x4, #40]
0x0000007fa4438168 <+104>: cbz x5, 0x7fa44381c8 <HashFind+200>
0x0000007fa443816c <+108>: mov x0, x5
0x0000007fa4438170 <+112>: mov x3, #0x0 // #0
0x0000007fa4438174 <+116>: b 0x7fa4438188 <HashFind+136>
0x0000007fa4438178 <+120>: ldr x2, [x0, #16] // ---- start for (bucket = table->buckets[hash]; bucket; bucket = bucket->next) {
0x0000007fa443817c <+124>: mov x3, x0
=> 0x0000007fa4438180 <+128>: mov x0, x2
0x0000007fa4438184 <+132>: cbz x2, 0x7fa44381c8 <HashFind+200>
0x0000007fa4438188 <+136>: ldr x1, [x0] // load bucket->key
0x0000007fa443818c <+140>: cmp x1, x20 // x20 holds the key argument
0x0000007fa4438190 <+144>: b.ne 0x7fa4438178 <HashFind+120> // b.any ---- end
0x0000007fa4438194 <+148>: cbz x3, 0x7fa4438240 <HashFind+320>
0x0000007fa4438198 <+152>: ldr x1, [x0, #16]
0x0000007fa443819c <+156>: str x1, [x3, #16]
0x0000007fa44381a0 <+160>: str x5, [x0, #16]
0x0000007fa44381a4 <+164>: str x0, [x4, #40]
0x0000007fa44381a8 <+168>: ldp x19, x20, [sp, #16]
0x0000007fa44381ac <+172>: ldr x1, [x21, #24]
0x0000007fa44381b0 <+176>: ldp x23, x24, [sp, #48]
0x0000007fa44381b4 <+180>: add x1, x1, #0x1
0x0000007fa44381b8 <+184>: str x1, [x21, #24]
0x0000007fa44381bc <+188>: ldp x21, x22, [sp, #32]
0x0000007fa44381c0 <+192>: ldp x29, x30, [sp], #80
0x0000007fa44381c4 <+196>: ret
0x0000007fa44381c8 <+200>: ldr x1, [x21, #32]
0x0000007fa44381cc <+204>: mov x0, #0x0 // #0
0x0000007fa44381d0 <+208>: ldp x19, x20, [sp, #16]
0x0000007fa44381d4 <+212>: add x1, x1, #0x1
0x0000007fa44381d8 <+216>: str x1, [x21, #32]
0x0000007fa44381dc <+220>: ldp x21, x22, [sp, #32]
0x0000007fa44381e0 <+224>: ldp x23, x24, [sp, #48]
0x0000007fa44381e4 <+228>: ldp x29, x30, [sp], #80
0x0000007fa44381e8 <+232>: ret
0x0000007fa44381ec <+236>: add x24, x23, #0x520
0x0000007fa44381f0 <+240>: mov x0, #0x25 // #37
0x0000007fa44381f4 <+244>: add x19, x24, #0x10
0x0000007fa44381f8 <+248>: add x24, x24, #0x810
0x0000007fa44381fc <+252>: str x25, [sp, #64]
0x0000007fa4438200 <+256>: bl 0x7fa44309b0 <drmRandomCreate@plt>
0x0000007fa4438204 <+260>: mov x25, x0
0x0000007fa4438208 <+264>: mov x0, x25
0x0000007fa443820c <+268>: bl 0x7fa4430b40 <drmRandom@plt>
0x0000007fa4438210 <+272>: str x0, [x19], #8
0x0000007fa4438214 <+276>: cmp x24, x19
0x0000007fa4438218 <+280>: b.ne 0x7fa4438208 <HashFind+264> // b.any
0x0000007fa443821c <+284>: mov x0, x25
0x0000007fa4438220 <+288>: bl 0x7fa4430930 <drmRandomDestroy@plt>
0x0000007fa4438224 <+292>: ldr w0, [x23, #1312]
0x0000007fa4438228 <+296>: ldr x25, [sp, #64]
0x0000007fa443822c <+300>: add w0, w0, #0x1
0x0000007fa4438230 <+304>: str w0, [x23, #1312]
0x0000007fa4438234 <+308>: cbnz x20, 0x7fa4438130 <HashFind+48>
0x0000007fa4438238 <+312>: mov x4, #0x0 // #0
0x0000007fa443823c <+316>: b 0x7fa4438158 <HashFind+88>
0x0000007fa4438240 <+320>: ldr x1, [x21, #16]
0x0000007fa4438244 <+324>: ldp x19, x20, [sp, #16]
0x0000007fa4438248 <+328>: add x1, x1, #0x1
0x0000007fa443824c <+332>: str x1, [x21, #16]
0x0000007fa4438250 <+336>: ldp x21, x22, [sp, #32]
0x0000007fa4438254 <+340>: ldp x23, x24, [sp, #48]
0x0000007fa4438258 <+344>: ldp x29, x30, [sp], #80
0x0000007fa443825c <+348>: ret
Running nexti repeatedly showed execution looping over the instructions from <HashFind+120> through <HashFind+144>, which suggested that the for loop had become infinite. These instructions map to the C code as follows:
- ldr x2, [x0, #16]: loads the word at address x0+16 into x2. The next member of HashBucket sits exactly 16 bytes from the start of the structure, so this corresponds to bucket = bucket->next in the for loop.
- mov x0, x2: moves bucket->next into x0.
- cbz x2, 0x7fa44381c8 <HashFind+200>: if x2 is zero, branch to HashFind+200. This is the for loop's termination test: if bucket is NULL, leave the loop.
- ldr x1, [x0]: loads the first word of the structure that x0 (the former bucket->next) now points to, i.e. the key member of the new bucket, into x1.
- cmp x1, x20: compares x1 (the new bucket's key) with x20 (the key argument), corresponding to if (bucket->key == key).
- b.ne 0x7fa4438178 <HashFind+120>: if they differ, branch back to HashFind+120, i.e. the ldr x2, [x0, #16] from the first step, and go around the loop again.
Based on this analysis, the assembly matches the for loop. Since table->buckets[hash] is a singly linked list, the only way the for loop can spin forever is if that list has become circular.
To check whether table->buckets[hash] really is circular, we print every bucket and bucket->next value and follow the chain.
With execution stopped at the instruction at address 0x0000007fa443817c, info reg showed the following register values:
(gdb) nexti
0x0000007fa443817c in HashFind ()
from /usr/lib/aarch64-linux-gnu/xdxgpu/libdrm.so.2
(gdb) info reg
x0 0x55b54b0010 368113811472
x1 0x0 0
x2 0x55b5c1e310 368121602832
x3 0x7fa4145c80 548213644416
x4 0x55b40a2dd8 368092786136
x5 0x55b454b910 368097671440
x6 0x17f 383
x7 0x0 0
x8 0x1d 29
x9 0x7fa46c2660 548219397728
x10 0x0 0
x11 0x0 0
x12 0x6d756e5f70694d20 7887331678596189472
x13 0x3d657a695320313d 4424076801746153789
x14 0xffffffffffffffff -1
x15 0x60 96
x16 0x7fa310e080 548196638848
x17 0x7fa4438308 548216734472
x18 0x1 1
x19 0x7fc6c61290 548795716240
x20 0xa1 161
x21 0x55b40a2710 368092784400
x22 0x0 0
x23 0x7fa4451000 548216836096
x24 0x7fc6c61388 548795716488
x25 0x7fa30fc320 548196565792
x26 0x55b58be420 368118064160
x27 0xa1 161
x28 0x55b461b470 368098522224
x29 0x7fc6c61200 548795716096
x30 0x7fa443834c 548216734540
sp 0x7fc6c61200 0x7fc6c61200
pc 0x7fa443817c 0x7fa443817c <HashFind+124>
cpsr 0x80200000 [ EL=0 SS N ]
fpsr 0x11 17
fpcr 0x0 0
Since the previous instruction was ldr x2, [x0, #16], x0 holds the current bucket pointer and the word at x0+0x10 is bucket->next. Walking the chain bucket by bucket gave the result below; the list turns out to be circular, so the suspicion is that the list operations were not properly serialized:
(gdb) x/xg 0x55b54b0020 print the word at x0+0x10 == 0x55b54b0010+0x10 == 0x55b54b0020, i.e. bucket->next.
0x55b54b0020: 0x00000055b5c1e310
(gdb) x/xg 0x00000055b5c1e320 print the word at 0x00000055b5c1e310+0x10 == 0x00000055b5c1e320, i.e. bucket->next->next; the commands below follow the chain in the same way.
0x55b5c1e320: 0x00000055b4400040
(gdb) x/xg 0x00000055b4400050
0x55b4400050: 0x00000055b5884d00
(gdb) x/xg 0x00000055b5884d10
0x55b5884d10: 0x00000055b586f510
(gdb) x/xg 0x00000055b586f520
0x55b586f520: 0x00000055b59942e0
(gdb) x/xg 0x00000055b59942f0
0x55b59942f0: 0x00000055b59ad3b0
(gdb) x/xg 0x00000055b59ad3c0
0x55b59ad3c0: 0x00000055b5849400
(gdb) x/xg 0x00000055b5849410
0x55b5849410: 0x00000055b58b18a0
(gdb) x/xg 0x00000055b58b18b0
0x55b58b18b0: 0x00000055b584e510
(gdb) x/xg 0x00000055b584e520
0x55b584e520: 0x00000055b59a4850
(gdb) x/xg 0x00000055b59a4860
0x55b59a4860: 0x00000055b58eba00
(gdb) x/xg 0x00000055b58eba10
0x55b58eba10: 0x00000055b58a2f20
(gdb) x/xg 0x00000055b58a2f30
0x55b58a2f30: 0x00000055b5c21e40
(gdb) x/xg 0x00000055b5c21e50
0x55b5c21e50: 0x00000055b5878620
(gdb) x/xg 0x00000055b5878630
0x55b5878630: 0x00000055b58ac970
(gdb) x/xg 0x00000055b58ac980
0x55b58ac980: 0x00000055b5c3f730
(gdb) x/xg 0x00000055b5c3f740
0x55b5c3f740: 0x00000055b5870210
(gdb) x/xg 0x00000055b5870220
0x55b5870220: 0x00000055b583a120
(gdb) x/xg 0x00000055b583a130
0x55b583a130: 0x00000055b59ab040
(gdb) x/xg 0x00000055b59ab050
0x55b59ab050: 0x00000055b59c8d70
(gdb) x/xg 0x00000055b59c8d80
0x55b59c8d80: 0x00000055b4542830
(gdb) x/xg 0x00000055b4542840
0x55b4542840: 0x00000055b5808820
(gdb) x/xg 0x00000055b5808830
0x55b5808830: 0x0000007fa4145c80
(gdb) x/xg 0x0000007fa4145c90
0x7fa4145c90: 0x00000055b54b0010
(gdb) x/xg 0x00000055b54b0020
0x55b54b0020: 0x00000055b5c1e310 this value equals the bucket->next we started from, i.e. the chain loops back on itself and the list is circular. The suspicion is that the list was modified without proper mutual exclusion.
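The manual pointer chasing above can also be expressed as code. The following sketch (added here for illustration, not run during the original session) applies Floyd's two-pointer cycle check to a HashBucket chain and reports whether the list loops back on itself:

#include <stdbool.h>
#include <stddef.h>

typedef struct HashBucket {
    unsigned long key;
    void *value;
    struct HashBucket *next;
} HashBucket;

static bool chain_has_cycle(const HashBucket *head)
{
    const HashBucket *slow = head;
    const HashBucket *fast = head;

    while (fast && fast->next) {
        slow = slow->next;          /* advances one node per step  */
        fast = fast->next->next;    /* advances two nodes per step */
        if (slow == fast)
            return true;            /* the pointers met: circular  */
    }
    return false;                   /* reached NULL: a proper list */
}

Applied to the chain starting at 0x55b54b0010 above, such a check would return true instead of walking forever.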
The code that operates on the hash table's lists is as follows:
static void xdxgpu_bo_free(struct xdxgpu_bo *xbo)
{
    struct xdxgpu_device *xdev = xbo->xdev;
    assert(xbo != NULL);
    drmHashDelete(xdev->bo_table, xbo->gem_handle);
    if (xbo->ptr)
        drm_munmap(xbo->ptr, xbo->size);
    drmCloseBufferHandle(xdev->fd, xbo->gem_handle);
    xdxgpu_device_put(xdev);
    free(xbo);
}

drm_public int xdxgpu_bo_create(xdxgpu_handle dev,
                                struct xdxgpu_arg_bo_create *arg,
                                xdxgpu_handle *bo)
{
    ......
    if (!(arg->property & XDXGPU_BO_PROPERTY_VPU_BIT)) {
        atomic_set(&xbo->refcount, 1);
        pthread_mutex_lock(&xdev->bo_tlb_lock);
        drmHashInsert(xdev->bo_table, xbo->gem_handle, xbo);
        pthread_mutex_unlock(&xdev->bo_tlb_lock);
        *bo = (xdxgpu_handle)xbo;
    }
    return 0;
}

static int xdxgpu_bo_import_from_dmabuf_fd(struct xdxgpu_device *xdev, int fd,
                                           struct xdxgpu_bo **ppxbo)
{
    ......
    pthread_mutex_lock(&xdev->bo_tlb_lock);
    xbo = xdxgpu_lookup_bo(xdev->bo_table, gem_handle);
    pthread_mutex_unlock(&xdev->bo_tlb_lock);
    .....
}

static int xdxgpu_bo_import_from_pvr_handle(struct xdxgpu_device *xdev,
                                            uint32_t handle,
                                            struct xdxgpu_bo **ppxbo)
{
    .......
    pthread_mutex_lock(&xdev->bo_tlb_lock);
    xbo = xdxgpu_lookup_bo(xdev->bo_table, gem_handle);
    pthread_mutex_unlock(&xdev->bo_tlb_lock);
    ......
    pthread_mutex_lock(&xdev->bo_tlb_lock);
    drmHashInsert(xdev->bo_table, xbo->gem_handle, xbo);
    pthread_mutex_unlock(&xdev->bo_tlb_lock);
    ......
}

drm_private struct xdxgpu_bo* xdxgpu_lookup_bo(void *tbl, uint32_t key)
{
    struct xdxgpu_bo *xbo = NULL;
    if (!drmHashLookup(tbl, key, (void **)&xbo))
        xdxgpu_bo_get(xbo);
    return xbo;
}
Note that xdxgpu_bo_free operates on the hash table without taking the lock. The suspicion is that one thread deleted a node from a bucket chain without holding the lock while other threads were operating on the same chain, turning the singly linked list into a circular one.
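To make the suspected failure mode concrete, the sketch below replays one possible interleaving as a single-threaded simulation: thread D is the unlocked drmHashDelete() called from xdxgpu_bo_free(), thread L is a lookup that does hold bo_tlb_lock (e.g. via xdxgpu_lookup_bo()), and both hit the same two-node chain. The statements mirror the "organize" block of HashFind() and the tail of drmHashDelete(); the scenario is purely illustrative and not a reconstruction of the actual crash:

#include <assert.h>
#include <stdio.h>

typedef struct HashBucket {
    unsigned long key;
    void *value;
    struct HashBucket *next;
} HashBucket;

int main(void)
{
    HashBucket A = { .key = 1 }, B = { .key = 2 };
    HashBucket *head = &A;                /* table->buckets[hash]               */
    A.next = &B;                          /* chain: A -> B -> NULL              */
    B.next = NULL;

    /* Thread L (locked lookup of key 2) has walked to prev == &A,
     * bucket == &B and is about to run HashFind()'s "organize" block.          */
    HashBucket *L_prev = &A, *L_bucket = &B;

    /* Thread D (unlocked delete of key 2) now runs its own HashFind()
     * "organize" block to completion:                                          */
    A.next = B.next;                      /* prev->next = bucket->next          */
    B.next = head;                        /* bucket->next = table->buckets[hash]*/
    head = &B;                            /* table->buckets[hash] = bucket      */

    /* Thread L resumes with its stale prev/bucket pointers:                    */
    L_prev->next = L_bucket->next;        /* A.next = B.next = &A : A self-loops*/
    L_bucket->next = head;                /* B.next = &B          : B self-loops*/
    head = L_bucket;

    /* Thread D finishes drmHashDelete(): unlink and free the bucket it found.  */
    head = B.next;                        /* table->buckets[hash] = bucket->next*/
    /* free(bucket) would follow in the real code; B is on the stack here.      */

    assert(A.next == &A && B.next == &B); /* the chain is now circular          */
    printf("head %p -> %p -> %p ...\n",
           (void *)head, (void *)head->next, (void *)head->next->next);
    return 0;
}

After this sequence, any later HashFind() on the chain for a key that is not present never reaches NULL and spins forever, matching the behaviour observed in Xorg.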
3. Verification and Fix
Attempts to reproduce the problem on the target machine failed completely. To verify the theory, the hash-table code was ported to a host machine and a test case was written around it. The test program runs three threads: a delete thread, an insert thread, and a lookup thread, which respectively delete from, insert into, and look up the hash table's chain. The delete operation is performed without the lock, while all other operations take it. To speed up reproduction, the number of buckets is reduced to 1 (the macro HASH_SIZE is set to 1 instead of the original 512), so the hash table has a single chain and every operation hits the same list. While the test runs, top is used to watch it: once the chain becomes circular, the lookup thread's CPU usage inevitably reaches 100% (as long as the list is not circular, the usleep calls keep CPU usage below 100%).
Immediately after starting the program, a thread at 100% CPU appeared, and gdb showed that the chain had indeed become circular (the test code is compiled as a debug build, which makes it easy to print the bucket addresses directly).
The test code is as follows:
----------------------------xf86drmHash.h----------------------------
#define HASH_SIZE 1     /* Good for about 100 entries */
                        /* If you change this value, you probably
                           have to change the HashHash hashing
                           function! */

typedef struct HashBucket {
    unsigned long key;
    void *value;
    struct HashBucket *next;
} HashBucket, *HashBucketPtr;

typedef struct HashTable {
    unsigned long magic;
    unsigned long entries;
    unsigned long hits;     /* At top of linked list */
    unsigned long partials; /* Not at top of linked list */
    unsigned long misses;   /* Not in table */
    HashBucketPtr buckets[HASH_SIZE];
    int p0;
    HashBucketPtr p1;
} HashTable, *HashTablePtr;
----------------------------main.c----------------------------
#define _GNU_SOURCE     /* needed for gettid() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include "xf86drmHash.h"
#include <pthread.h>

#define HASH_MAGIC 0xdeadbeef
#define RANDOM_MAGIC 0xfeedbeef
# define drm_private
# define drm_public
typedef struct RandomState {
    unsigned long magic;
    unsigned long a;
    unsigned long m;
    unsigned long q;        /* m div a */
    unsigned long r;        /* m mod a */
    unsigned long check;
    unsigned long seed;
} RandomState;

void *drmMalloc(int size)
{
    return calloc(1, size);
}

void drmFree(void *pt)
{
    free(pt);
}

drm_public unsigned long drmRandom(void *state)
{
    RandomState *s = (RandomState *)state;
    unsigned long hi;
    unsigned long lo;

    hi = s->seed / s->q;
    lo = s->seed % s->q;
    s->seed = s->a * lo - s->r * hi;
    if ((s->a * lo) <= (s->r * hi)) s->seed += s->m;

    return s->seed;
}

drm_public int drmRandomDestroy(void *state)
{
    drmFree(state);
    return 0;
}

drm_public void *drmRandomCreate(unsigned long seed)
{
    RandomState *state;

    state = drmMalloc(sizeof(*state));
    if (!state) return NULL;
    state->magic = RANDOM_MAGIC;
#if 0
    /* Park & Miller, October 1988 */
    state->a = 16807;
    state->m = 2147483647;
    state->check = 1043618065; /* After 10000 iterations */
#else
    /* Park, Miller, and Stockmeyer, July 1993 */
    state->a = 48271;
    state->m = 2147483647;
    state->check = 399268537; /* After 10000 iterations */
#endif
    state->q = state->m / state->a;
    state->r = state->m % state->a;
    state->seed = seed;

    /* Check for illegal boundary conditions, and choose closest legal value. */
    if (state->seed <= 0)        state->seed = 1;
    if (state->seed >= state->m) state->seed = state->m - 1;

    return state;
}

static unsigned long HashHash(unsigned long key)
{
    unsigned long hash = 0;
    unsigned long tmp = key;
    static int init = 0;
    static unsigned long scatter[256];
    int i;

    if (!init) {
        void *state;
        state = drmRandomCreate(37);
        for (i = 0; i < 256; i++) scatter[i] = drmRandom(state);
        drmRandomDestroy(state);
        ++init;
    }

    while (tmp) {
        hash = (hash << 1) + scatter[tmp & 0xff];
        tmp >>= 8;
    }

    hash %= HASH_SIZE;
    return hash;
}
drm_public void *drmHashCreate(void)
{
    HashTablePtr table;

    table = drmMalloc(sizeof(*table));
    if (!table) return NULL;
    table->magic = HASH_MAGIC;

    return table;
}

drm_public int drmHashDestroy(void *t)
{
    HashTablePtr table = (HashTablePtr)t;
    HashBucketPtr bucket;
    HashBucketPtr next;
    int i;

    if (table->magic != HASH_MAGIC) return -1; /* Bad magic */

    for (i = 0; i < HASH_SIZE; i++) {
        for (bucket = table->buckets[i]; bucket;) {
            next = bucket->next;
            drmFree(bucket);
            bucket = next;
        }
    }
    drmFree(table);
    return 0;
}

/* Find the bucket and organize the list so that this bucket is at the top. */
static HashBucketPtr HashFind(HashTablePtr table,
                              unsigned long key, unsigned long *h)
{
    unsigned long hash = HashHash(key);
    HashBucketPtr prev = NULL;
    HashBucketPtr bucket;

    if (h) *h = hash;

    for (bucket = table->buckets[hash]; bucket; bucket = bucket->next) {
        if (bucket->key == key) {
            if (prev) {
                /* Organize */
                prev->next = bucket->next;
                bucket->next = table->buckets[hash];
                table->buckets[hash] = bucket;
                ++table->partials;
            } else {
                ++table->hits;
            }
            return bucket;
        }
        prev = bucket;
    }
    ++table->misses;
    return NULL;
}

drm_public int drmHashLookup(void *t, unsigned long key, void **value)
{
    HashTablePtr table = (HashTablePtr)t;
    HashBucketPtr bucket;

    if (!table || table->magic != HASH_MAGIC) return -1; /* Bad magic */

    bucket = HashFind(table, key, NULL);
    if (!bucket) return 1;  /* Not found */

    *value = bucket->value;
    return 0;               /* Found */
}

drm_public int drmHashInsert(void *t, unsigned long key, void *value)
{
    HashTablePtr table = (HashTablePtr)t;
    HashBucketPtr bucket;
    unsigned long hash;

    if (table->magic != HASH_MAGIC) return -1; /* Bad magic */

    if (HashFind(table, key, &hash)) return 1; /* Already in table */

    bucket = drmMalloc(sizeof(*bucket));
    if (!bucket) return -1; /* Error */

    bucket->key = key;
    bucket->value = value;
    bucket->next = table->buckets[hash];
    table->buckets[hash] = bucket;
    return 0;               /* Added to table */
}

drm_public int drmHashDelete(void *t, unsigned long key)
{
    HashTablePtr table = (HashTablePtr)t;
    unsigned long hash;
    HashBucketPtr bucket;

    if (table->magic != HASH_MAGIC) return -1; /* Bad magic */

    bucket = HashFind(table, key, &hash);
    if (!bucket) return 1;  /* Not found */

    table->buckets[hash] = bucket->next;
    drmFree(bucket);
    return 0;
}
drm_public int drmHashNext(void *t, unsigned long *key, void **value)
{
    HashTablePtr table = (HashTablePtr)t;

    while (table->p0 < HASH_SIZE) {
        if (table->p1) {
            *key = table->p1->key;
            *value = table->p1->value;
            table->p1 = table->p1->next;
            return 1;
        }
        table->p1 = table->buckets[table->p0];
        ++table->p0;
    }
    return 0;
}

drm_public int drmHashFirst(void *t, unsigned long *key, void **value)
{
    HashTablePtr table = (HashTablePtr)t;

    if (table->magic != HASH_MAGIC) return -1; /* Bad magic */

    table->p0 = 0;
    table->p1 = table->buckets[0];
    return drmHashNext(table, key, value);
}
pthread_mutex_t g_lock;
void *g_tlb;
int g_key_start = 1;
int g_key_end = 100;

void *find_fun(void *arg)
{
    int i = 0;

    printf("find thread: %d\n", gettid());
    while (1)
    {
        void *value;

        printf("find thread run!\n");
        for (i = g_key_start; i < g_key_end; i ++) {
            printf("find thread run1!\n");
            pthread_mutex_lock(&g_lock);
            printf("find thread run2!\n");
            drmHashLookup(g_tlb, i, (void **)&value);
            printf("find thread run3!\n");
            pthread_mutex_unlock(&g_lock);
            printf("find thread run4!\n");
        }
        usleep(100);
    }

    printf("find thread exit!\n");
    return NULL;
}
void *del_fun(void *arg)
{
    int i = 0;

    printf("delete thread: %d\n", gettid());
    while (1)
    {
        for (i = g_key_start; i < g_key_end; i ++) {
            /* Note: in the run that reproduced the hang, the delete was done
             * WITHOUT taking g_lock, as described above; the two lock calls
             * here correspond to the fix mentioned after the listing. */
            pthread_mutex_lock(&g_lock);
            drmHashDelete(g_tlb, i);
            pthread_mutex_unlock(&g_lock);
        }
        usleep(100);
    }

    printf("delete thread exit!\n");
    return NULL;
}
int main()
{
    pthread_t find_tid;
    pthread_t del_tid;
    int i = 0;

    pthread_mutex_init(&g_lock, NULL);
    g_tlb = drmHashCreate();

    pthread_create(&find_tid, NULL, find_fun, NULL);
    pthread_create(&del_tid, NULL, del_fun, NULL);

    printf("insert thread: %d\n", gettid());
    while (1)
    {
        for (i = g_key_start; i < g_key_end; i ++)
        {
            void *value = (void *)i;

            pthread_mutex_lock(&g_lock);
            drmHashInsert(g_tlb, i, value);
            pthread_mutex_unlock(&g_lock);
        }
        usleep(100);
    }

    printf("insert thread exit!\n");
    return 0;
}
After the delete operation in the test code was also wrapped with the lock, the problem no longer appeared.
The fix to the library code is as follows:
static void xdxgpu_bo_free(struct xdxgpu_bo *xbo)
{
    struct xdxgpu_device *xdev = xbo->xdev;
    assert(xbo != NULL);
+   pthread_mutex_lock(&xdev->bo_tlb_lock);
    drmHashDelete(xdev->bo_table, xbo->gem_handle);
+   pthread_mutex_unlock(&xdev->bo_tlb_lock);
    if (xbo->ptr)
        drm_munmap(xbo->ptr, xbo->size);
    drmCloseBufferHandle(xdev->fd, xbo->gem_handle);
    xdxgpu_device_put(xdev);
    free(xbo);
}
4. Summary
The hash table has 512 buckets, i.e. 512 separate chains, so the probability that multiple threads simultaneously delete from and otherwise operate on the same chain is very low (roughly a 1-in-512 chance that two independent operations even pick the same chain, and the unlocked delete must additionally interleave with the other operation within a very short window). That is why the problem is so hard to reproduce.
Some bugs have an extremely low reproduction rate, so the first occurrence is especially valuable: examine it with as many techniques as possible while it is still there. This bug was pinned down at that first occurrence; it never showed up again in later reproduction attempts on the target, which is why a local test program was written to reproduce the race and verify the fix.