目录
KASAN(Kernel Address Sanitizer)是一个动态检测内存错误的工具。KASAN 可以检测全局变量、栈、堆分配的内存发生的越界访问(out-of-bounds)和释放后访问(use-after-free)等问题,但是不能检测内存泄漏问题。
1、整体思路
a、内核支持 kasan 配置项
b、内核打开调试信息,增加 -g 编译参数,以便使用 addr2line 工具定位越界访问的代码行
c、分析kasan的打印日志定位信息,kasan的日志信息主要分为4部分
- 1) 读或写异常访问的栈回溯信息。
- 2) 数据结构空间分配的堆栈信息。
- 3) 数据结构内存释放的堆栈信息(该内存被释放过时才会打印,例如 use-after-free)。
- 4) 异常访问地址附近内存的可用状态。
d、根据kasan的堆栈打印,定位到具体的代码行,分析问题。
2、内核支持KASAN功能,打开调试信息
a、内核支持 kasan 配置项,当出现越界访问时会打印kasan的报告日志。
CONFIG_SLUB_DEBUG=y
CONFIG_KASAN=y
b、内核打开调试信息,增加 -g 编译参数,编译器优化等级改为 -O0,以便使用 addr2line 工具定位越界访问的代码行。
linux/src/Makefile 文件,搜索编译参数 KBUILD_CFLAGS,在编译参数上增加 -g 选项:
KBUILD_CFLAGS := -DCONFIG_DH_MODIFY -Wall -g -Wundef -Werror=strict-prototypes -Wno-trigraphs \
-fno-strict-aliasing -fno-common -fshort-wchar -fno-PIE \
-Werror=implicit-function-declaration -Werror=implicit-int \
-Werror=return-type -Wno-format-security \
-std=gnu89 -fstack-protector-all -z noexecstack
c、注意:开启上面选项之后所有的驱动模块需要重新编译,不然驱动模块可能无法正常执行。
3、 使用 addr2line 工具定位到异常代码行
以防火墙模块为例:[ 131.412368] memory_init+0x124/0x2ec [netctrl]
memory_init 函数大小为 0x2ec 字节,代码执行到偏移 0x124 出异常,我们需要定位 0x124 对应哪一行代码。
a、从ko模块中获取 memory_init 函数的起始地址,有三种方式,nm、readelf、objdump工具
这里只介绍 nm 和 readelf 两种工具。
[user]$ nm netctrl.ko | grep memory_init
000000000000107c T memory_init
[user]$ readelf -s netctrl.ko | grep memory_init
162: 000000000000107c 748 FUNC GLOBAL DEFAULT 1 memory_init
b、根据 0x124/0x2ec 偏移地址定位到代码行地址
memory_init 函数起始地址为 000000000000107c
memory_init+0x124/0x2ec [netctrl]异常代码对应的地址为:0x11A0 = 0x000000000000107c + 0x124
c、使用addr2line工具找到异常代码所在文件名和行号
memory_init 函数源码为:
int memory_init(void)
{
V_TEMP_LISTS *pTempV = NULL;/*临时白名单*/
pIpv4 = (V4_LISTS*)kmalloc(g_max_list_num * sizeof(V4_LISTS), GFP_KERNEL);
if (!pIpv4)
goto __error;
memset(pIpv4, 0, g_max_list_num * sizeof(V4_LISTS));
pIpv6 = (V6_LISTS*)kmalloc(g_max_list_num * sizeof(V6_LISTS), GFP_KERNEL);
if (!pIpv6)
goto __error;
memset(pIpv6, 0, g_max_list_num * sizeof(V6_LISTS)); /* 此行对应 /netfireware/comm.c:68 */
/* addr2line工具定位为此行,确定是此行分配了 pTempV 的空间, g_max_list_num 为64 */
pTempV = (V_TEMP_LISTS*)kmalloc(g_max_list_num * sizeof(V_TEMP_LISTS), GFP_KERNEL);
if (!pTempV)
goto __error;
memset(pTempV, 0, g_max_list_num * sizeof(V_TEMP_LISTS));
pIcmpV = (V_ICMP_LISTS*)kmalloc(g_max_list_num * sizeof(V_ICMP_LISTS), GFP_KERNEL);
if (!pIcmpV)
goto __error;
memset(pIcmpV, 0, g_max_list_num * sizeof(V_ICMP_LISTS));
pMac = (MAC_LISTS*)kmalloc(g_max_list_num * sizeof(MAC_LISTS), GFP_KERNEL);
if (!pMac)
goto __error;
memset(pMac, 0, g_max_list_num * sizeof(MAC_LISTS));
return 0;
}
aarch64-linux-gnu-rk3588-v1-addr2line -C -f -e netctrl.ko 0x11a0 memory_init
[user]$ aarch64-linux-gnu-rk3588-v1-addr2line -C -f -e netctrl.ko 0x11a0 memory_init
kmalloc
/home/233410/d3u3588/rklinux/src/./include/linux/slab.h:557
上面可以定位到是 memory_init 函数中的 kmalloc 行,但是 memory_init 函数有多个 kmalloc;我们希望定位具体哪个 kmalloc 函数;找 0x11a0 地址前24字节地址即可。
[user]$ aarch64-linux-gnu-rk3588-v1-addr2line -C -f -e netctrl.ko 0x1188 memory_init
memory_init
/home/233410/d3u3588/netfireware/comm.c:68
4、读数据越界访问案例分析
4.1、越界访问源码分析
int temp_inside(unsigned char* dip, unsigned short port, int version, int isOut)
{
int i = 0;
......................................................................
/* g_max_list_num 为64,i最大只能到63,进入循环执行后,再执行i++此时i等于64;退出循环 */
for (i = 0; i < g_max_list_num; i++)
{
if (pTempV[i].seconds && (jiffies/HZ - pTempV[i].seconds > 3599))
{
memset(&pTempV[i], 0, sizeof(V_TEMP_LISTS));
}
}
......................................................................
if (isOut) /* isOut 入参为 0 */
pTempV[i].out_drop++;
else
/* 执行完循环后,i为64,pTempV数组只有64个元素,所以i=64会出现越界访问 */
pTempV[i].in_drop++;
return 0;
}
4.2、读越界kasan报告分析
[ 131.411886] ==================================================================
/* 越界访问出现在 [netctrl] 模块 temp_inside 函数的 +0x278/0x2a0 偏移位置 */
[ 131.411924] BUG: KASAN: slab-out-of-bounds in temp_inside+0x278/0x2a0 [netctrl]
/* 越界访问是读8字节数据时出现的,具体地址在 ffffff810cffee18 ,异常访问是 UPDATASOCKET 任务,pid = 1288 触发的 */
[ 131.411934] Read of size 8 at addr ffffff810cffee18 by task UPDATASOCKET/1288
[ 131.411941]
[ 131.411951] CPU: 2 PID: 1288 Comm: UPDATASOCKET Tainted: P O 5.10.66 #1
[ 131.411959] Hardware name: Rockchip RK3588 NVR DEMO LP4 V10 Board (DT)
/* 异常访问的堆栈信息,定位异常访问的代码行:[ 131.412037] temp_inside+0x278/0x2a0 [netctrl]
nm netctrl.ko | grep temp_inside
[233410@yanfa219_ubuntu18-jk128:weops ckms]$ nm netctrl.ko | grep temp_inside
0000000000001a4c T temp_inside
具体异常访问代码地址: 0000000000001CC4 = 0000000000001a4c + 0x278
aarch64-linux-gnu-rk3588-v1-addr2line -C -f -e netctrl.ko 0x1cc4 temp_inside
[233410@yanfa219_ubuntu18-jk128:weops ckms]$ aarch64-linux-gnu-rk3588-v1-addr2line -C -f -e netctrl.ko 0x1cc4 temp_inside
temp_inside
/home/233410/d3u3588/netfireware/comm.c:363
找到异常访问代码行,进行分析即可。
*/
[ 131.411966] Call trace:
[ 131.411978] dump_backtrace+0x0/0x2b8
[ 131.411986] show_stack+0x24/0x30
[ 131.411997] dump_stack_lvl+0x108/0x14c
[ 131.412008] print_address_description.constprop.0+0x38/0x280
[ 131.412017] kasan_report+0x14c/0x1f0
[ 131.412026] __asan_load8+0x3c/0xa8
[ 131.412037] temp_inside+0x278/0x2a0 [netctrl]
[ 131.412048] hook_ipv6_in+0x168/0x1d4 [netctrl]
[ 131.412058] nf_hook_slow+0x7c/0xec
[ 131.412068] NF_HOOK.constprop.0+0xf0/0x150
[ 131.412076] ipv6_rcv+0x70/0x8c
[ 131.412085] __netif_receive_skb_one_core+0xe8/0x130
[ 131.412094] __netif_receive_skb+0xac/0xb4
[ 131.412102] process_backlog+0x10c/0x218
[ 131.412110] net_rx_action+0x23c/0x4c0
[ 131.412118] __do_softirq+0x308/0x404
[ 131.412127] do_softirq+0x64/0x80
[ 131.412137] netif_rx_ni+0xc4/0x14c
[ 131.412146] dev_loopback_xmit+0xb0/0xc8
[ 131.412154] NF_HOOK.constprop.0+0x100/0x160
[ 131.412163] ip6_finish_output2+0x2ac/0x8d8
[ 131.412171] __ip6_finish_output+0x218/0x21c
[ 131.412179] ip6_output+0x190/0x218
[ 131.412188] dst_output+0x4c/0x60
[ 131.412197] ip6_local_out+0x48/0x5c
[ 131.412205] ip6_send_skb+0x58/0xe8
[ 131.412214] udp_v6_send_skb+0x3c0/0x5c8
[ 131.412223] udpv6_sendmsg+0x910/0xb88
[ 131.412231] inet6_sendmsg+0x6c/0x88
[ 131.412241] sock_sendmsg_nosec+0x4c/0x6c
[ 131.412250] __sys_sendto+0x14c/0x1bc
[ 131.412259] __arm64_sys_sendto+0x84/0xa0
[ 131.412268] el0_svc_common.constprop.0+0x1a8/0x244
[ 131.412276] do_el0_svc+0xc8/0x100
[ 131.412286] el0_svc+0x20/0x30
[ 131.412295] el0_sync_handler+0xd8/0x184
[ 131.412302] el0_sync+0x1a0/0x1c0
[ 131.412309]
/* 异常访问地址对应的数据结构空间分配的堆栈信息,找到实际分配的数据结构信息;
根据 [ 131.412368] memory_init+0x124/0x2ec [netctrl] ,通过 3、 的分析,我们可以知道
pTempV = (V_TEMP_LISTS*)kmalloc(g_max_list_num * sizeof(V_TEMP_LISTS), GFP_KERNEL),确定是此行分配了 pTempV 的空间;
g_max_list_num 为64,V_TEMP_LISTS 结构体大小为 4*6+8*4 = 56 字节,因此确认 pTempV 指针指向了一片 56 * 64 = 3584 字节的空间。
*/
[ 131.412315] Allocated by task 756:
[ 131.412325] kasan_save_stack+0x28/0x58
[ 131.412333] kasan_set_track+0x28/0x3c
[ 131.412341] ____kasan_kmalloc+0x84/0x9c
[ 131.412349] __kasan_kmalloc+0x10/0x1c
[ 131.412357] __kmalloc+0x1b4/0x234
[ 131.412368] memory_init+0x124/0x2ec [netctrl]
[ 131.412379] ly_ioctl+0x290/0x79c [netctrl]
[ 131.412387] vfs_ioctl+0x74/0x84
[ 131.412396] do_vfs_ioctl+0x6a8/0x898
[ 131.412404] __arm64_sys_ioctl+0x6c/0xbc
[ 131.412412] el0_svc_common.constprop.0+0x1a8/0x244
[ 131.412421] do_el0_svc+0xc8/0x100
[ 131.412429] el0_svc+0x20/0x30
[ 131.412438] el0_sync_handler+0xd8/0x184
[ 131.412445] el0_sync+0x1a0/0x1c0
[ 131.412451]
/* 异常访问地址附近的内存状态分析。
ffffff810cffe000: V_TEMP_LISTS 结构首地址
V_TEMP_LISTS 结构共 0xE00 = 3584 字节大小,此部分空间是可访问状态,对应 00
ffffff810cffee00: V_TEMP_LISTS 结构结束地址
中间 kmalloc 为了对齐,多分配的内存空间状态为 fc ,不可访问状态。
ffffff810cfff000: V_TEMP_LISTS 有 3584 字节大小,kmalloc为了对齐分配了 4096 字节空间
分析出2个信息:
第一:实际可访问空间内存大小:可用的起始地址为 ffffff810cffe000 ;
[ 131.412560] ffffff810cffed80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 131.412568] >ffffff810cffee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
可访问空间结束地址(00状态对应的空间地址)为:ffffff810cffee00;中间有 3584 个字节,与 V_TEMP_LISTS 结构体地址分析对应上。
第二:获取异常地址对应的成员名
注意 ^ 符号指向的地址,越界读取8字节数据的开始地址为 0xffffff810cffee18 = ffffff810cffe000 + e18(3608),即 pTempV[64] 内偏移 0x18 处,对应的成员为 in_drop;上面堆栈打印 Read of size 8 at addr ffffff810cffee18 by task UPDATASOCKET/1288 也直接指出了异常地址。
typedef struct V_TEMP_LISTS
{
(ffffff810cffee00) unsigned int ip[4];
(ffffff810cffee10) unsigned int port;
(ffffff810cffee14) unsigned int seconds;
(ffffff810cffee18) unsigned long in_drop;
(ffffff810cffee20) unsigned long in_accept;
(ffffff810cffee28) unsigned long out_drop;
(ffffff810cffee30) unsigned long out_accept;
}V_TEMP_LISTS, *PV_TEMP_LISTS;
*/
[ 131.412459] The buggy address belongs to the object at ffffff810cffe000
[ 131.412459] which belongs to the cache kmalloc-4k of size 4096
[ 131.412469] The buggy address is located 3608 bytes inside of
[ 131.412469] 4096-byte region [ffffff810cffe000, ffffff810cfff000)
[ 131.412476] The buggy address belongs to the page:
[ 131.412486] page:0000000097d7c9b3 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10cff8
[ 131.412494] head:0000000097d7c9b3 order:3 compound_mapcount:0 compound_pincount:0
[ 131.412504] flags: 0x8000000000010200(slab|head)
[ 131.412514] raw: 8000000000010200 ffffffff0413fc00 0000000200000002 ffffff8100003500
[ 131.412524] raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000
[ 131.412531] page dumped because: kasan: bad access detected
[ 131.412537]
[ 131.412543] Memory state around the buggy address:
[ 131.412552] ffffff810cffed00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 131.412560] ffffff810cffed80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 131.412568] >ffffff810cffee00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 131.412574] ^
[ 131.412582] ffffff810cffee80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 131.412591] ffffff810cffef00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 131.412597] ==================================================================
5、写数据越界访问案例分析
5.1、写越界访问代码分析
/*
 * KASAN demonstration: deliberately writes one byte past the end of a
 * 123-byte kmalloc() buffer so that KASAN reports a slab-out-of-bounds write.
 * The out-of-bounds store is intentional; do not "fix" it.
 */
static noinline void __init kmalloc_oob_right(void)
{
char *ptr;
size_t size = 123;
pr_info("out-of-bounds to right\n");
/* Request a 123-byte buffer; valid indices are 0..122. */
ptr = kmalloc(size, GFP_KERNEL);
if (!ptr) {
pr_err("Allocation failed\n");
return;
}
/* Index 123 is the 124th byte, one past the end: out-of-bounds write. */
ptr[size] = 'x';
kfree(ptr);
}
5.2、写越界kasan报告分析
/proc/dbug # echo oob > kasan
[ 108.292207]
[ 132.455158] open embedsky board device!
/proc/dbug # [ 132.457626] printk kbuf oob
[ 132.457626]
[ 132.457653] Executing function for cmd1
[ 132.457672] out-of-bounds to right
[ 132.457702] ==================================================================
/* 检测出错类型 (slab-out-of-bounds) ,出错的模块名 [dbg] */
[ 132.457743] BUG: KASAN: slab-out-of-bounds in proc_wrbuff_write+0xe8/0x24c [dbg]
/* 写数据出错,写一个字节数据时发现越界了,打出访问出错的进程和pid号 */
[ 132.457768] Write of size 1 at addr ffffff810402e47b by task echo/1151
[ 132.457787]
/* 打出访问的堆栈信息 */
[ 132.457814] CPU: 4 PID: 1151 Comm: echo Tainted: P O 5.10.66 #1
[ 132.457837] Hardware name: Rockchip RK3588 NVR DEMO LP4 V10 Board (DT)
[ 132.457857] Call trace:
[ 132.457888] dump_backtrace+0x0/0x2b8
[ 132.457914] show_stack+0x24/0x30
[ 132.457945] dump_stack_lvl+0x108/0x14c
[ 132.457978] print_address_description.constprop.0+0x38/0x280
[ 132.458006] kasan_report+0x14c/0x1f0
[ 132.458033] __asan_store1+0x3c/0x9c
[ 132.458065] proc_wrbuff_write+0xe8/0x24c [dbg]
[ 132.458092] proc_reg_write+0xf4/0x10c
[ 132.458121] vfs_write+0xc4/0x150
[ 132.458150] ksys_write+0xd8/0x158
[ 132.458178] __arm64_sys_write+0x50/0x64
[ 132.458208] el0_svc_common.constprop.0+0x1a8/0x244
[ 132.458234] do_el0_svc+0xc8/0x100
[ 132.458263] el0_svc+0x20/0x30
[ 132.458293] el0_sync_handler+0xd8/0x184
[ 132.458318] el0_sync+0x1a0/0x1c0
[ 132.458336]
/* 打印此内存创建函数的堆栈信息 */
[ 132.458355] Allocated by task 1151:
[ 132.458383] kasan_save_stack+0x28/0x58
[ 132.458409] kasan_set_track+0x28/0x3c
[ 132.458435] ____kasan_kmalloc+0x84/0x9c
[ 132.458461] __kasan_kmalloc+0x10/0x1c
[ 132.458487] kmem_cache_alloc_trace+0x168/0x1fc
[ 132.458518] proc_wrbuff_write+0x1cc/0x24c [dbg]
[ 132.458543] proc_reg_write+0xf4/0x10c
[ 132.458571] vfs_write+0xc4/0x150
[ 132.458598] ksys_write+0xd8/0x158
[ 132.458627] __arm64_sys_write+0x50/0x64
[ 132.458653] el0_svc_common.constprop.0+0x1a8/0x244
[ 132.458679] do_el0_svc+0xc8/0x100
[ 132.458706] el0_svc+0x20/0x30
[ 132.458734] el0_sync_handler+0xd8/0x184
[ 132.458758] el0_sync+0x1a0/0x1c0
[ 132.458774]
/*
出错地址位于一个数据结构(一片内存)内部,该数据结构的起始地址是 ffffff810402e400
这片空间是使用 kmalloc 函数分配的,对象大小是 128 字节;我们实际只申请了 123 字节,但 kmalloc 从 kmalloc-128 缓存中分配,因对齐和分配粒度等原因多分配了空间。
出问题的地址位于起始地址 ffffff810402e400 之后偏移 123 字节处(即第 124 个字节),问题地址属于 128-byte region [ffffff810402e400, ffffff810402e480)
*/
[ 132.458798] The buggy address belongs to the object at ffffff810402e400
[ 132.458798] which belongs to the cache kmalloc-128 of size 128
[ 132.458826] The buggy address is located 123 bytes inside of
[ 132.458826] 128-byte region [ffffff810402e400, ffffff810402e480)
[ 132.458847] The buggy address belongs to the page:
[ 132.458874] page:00000000067f7e59 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10402e
[ 132.458898] head:00000000067f7e59 order:1 compound_mapcount:0
[ 132.458924] flags: 0x8000000000010200(slab|head)
[ 132.458957] raw: 8000000000010200 ffffffff03e94880 0000000800000008 ffffff8100003c80
[ 132.458987] raw: 0000000000000000 0000000080200020 00000001ffffffff 0000000000000000
[ 132.459008] page dumped because: kasan: bad access detected
[ 132.459026]
/* 显示内存可使用情况;00 一个字节数据可以表示 8 个字节内存的状态;看源码我们分配了 123 字节的空间;
第一:特别注意 ^ 号指向的位置:>ffffff810402e400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 03。每个 00 表示对应的 8 字节全部可访问,最后的 03 表示对应的 8 字节中只有前 3 字节可访问,因此实际可访问的空间大小 = 15*8+3 = 123 字节;上面提到对象大小为 128 字节,剩余的 5 字节是不可访问的。
第二:通过上面打印 Write of size 1 at addr ffffff810402e47b by task echo/1151 可知异常访问地址为: ffffff810402e47b ,也可通过下面方式计算得到:ffffff810402e400 + 7B(123),刚好是数组结束的地址。
*/
[ 132.459044] Memory state around the buggy address:
[ 132.459070] ffffff810402e300: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 132.459097] ffffff810402e380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 132.459124] >ffffff810402e400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 03
[ 132.459144] ^
[ 132.459171] ffffff810402e480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 132.459197] ffffff810402e500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 132.459217] ==================================================================
6、kasan与kdump结合
在 uboot 传给内核的 cmdline 中配置 kasan.fault=panic,则 kasan 打印完堆栈报告之后,会调用 panic 触发 kdump 机制。
X:\work\linux\linux-6.0\mm\kasan\report.c
kasan_report
start_report(&irq_flags, true); /* 打印kasan的开始: ================================================================== */
print_report(&info); /* 打印kasan中间内容: BUG: KASAN: slab-out-of-bounds in temp_inside+0x278/0x2a0 [netctrl] */
end_report(&irq_flags, ptr); /* 打印kasan的结束: ================================================================== */
if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
panic("panic_on_warn set ...\n");
if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)/* 设置 kasan.fault=panic ,调用panic,进入kdump流程启动捕获内核 */
panic("kasan.fault=panic set ...\n");
默认情况下,kasan 检测到错误后不会触发 cpu 死机,只是将堆栈信息打印出来;但 kasan 已经支持通过 kasan.fault=panic 参数,在检测到内存越界时触发 panic。
/*
 * Parse the "kasan.fault=" boot parameter; accepted values are "report"
 * and "panic". The selection is stored in kasan_arg_fault.
 *
 * Returns 0 on success, or -EINVAL when arg is NULL or is not one of the
 * accepted values.
 */
static int __init early_kasan_fault(char *arg)
{
/* No value supplied: reject. */
if (!arg)
return -EINVAL;
if (!strcmp(arg, "report"))
kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
else if (!strcmp(arg, "panic"))
kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
else
/* Any other value is rejected. */
return -EINVAL;
return 0;
}
/* Register early_kasan_fault as the handler for the "kasan.fault" parameter. */
early_param("kasan.fault", early_kasan_fault);
728

被折叠的 条评论
为什么被折叠?



