通过上一篇博客我们知道,linker 完成自身重定位之后,在对可执行程序进行重定位的过程中,会初始化debuggerd,也就是注册异常处理函数,以便在程序发生异常的时候抓取异常信息。
4185 /*
4186 * This code is called after the linker has linked itself and
4187 * fixed it's own GOT. It is safe to make references to externs
4188 * and other non-local data at this point.
4189 */
4190 static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
...
4202 debuggerd_init(); //进行debuggerd 的初始化
...
4389 TRACE("[ Ready to execute \"%s\" @ %p ]", si->get_realpath(), reinterpret_cast<void*>(si->entry));
4390 return si->entry;
4391 }
追踪debuggerd_init() 这个函数
bionic/linker/debugger.cpp
__LIBC_HIDDEN__ void debuggerd_init() {
/*
* bionic/libc/kernel/uapi/asm-generic/signal.h
* struct sigaction {
* __sighandler_t sa_handler; //信号对应的处理函数
* unsigned long sa_flags;
* #ifdef SA_RESTORER
* __sigrestore_t sa_restorer; //信号处理函数执行完之后的返回函数,应用一般不设置,由libc 或kernel 提供默认值
* #endif
* sigset_t sa_mask;
* };
*/
303 struct sigaction action;
304 memset(&action, 0, sizeof(action));
305 sigemptyset(&action.sa_mask);
306 action.sa_sigaction = debuggerd_signal_handler; // debuggerd_signal_handler 就是处理函数
307 action.sa_flags = SA_RESTART | SA_SIGINFO;
308
309 // Use the alternate signal stack if available so we can catch stack overflows.
310 action.sa_flags |= SA_ONSTACK; //使用独立的栈空间
311
312 sigaction(SIGABRT, &action, nullptr);
313 sigaction(SIGBUS, &action, nullptr);
314 sigaction(SIGFPE, &action, nullptr);
315 sigaction(SIGILL, &action, nullptr);
316 sigaction(SIGSEGV, &action, nullptr);
317 #if defined(SIGSTKFLT)
318 sigaction(SIGSTKFLT, &action, nullptr);
319 #endif
320 sigaction(SIGTRAP, &action, nullptr);
321 }
debuggerd_signal_handler 就是程序收到SIGABRT,SIGBUS,SIGFPE,SIGILL,SIGSEGV等 这几个信号时,会调用的处理函数。
bionic/libc/bionic/sigaction.cpp
36 extern "C" int __rt_sigaction(int, const struct __kernel_sigaction*, struct __kernel_sigaction*, size_t);
37
38 int sigaction(int signal, const struct sigaction* bionic_new_action, struct sigaction* bionic_old_action) {
39 __kernel_sigaction kernel_new_action;
40 if (bionic_new_action != NULL) {
41 kernel_new_action.sa_flags = bionic_new_action->sa_flags;
42 kernel_new_action.sa_handler = bionic_new_action->sa_handler;
43 kernel_new_action.sa_mask = bionic_new_action->sa_mask;
44 #if defined(SA_RESTORER)
45 kernel_new_action.sa_restorer = bionic_new_action->sa_restorer;
46 #if defined(__aarch64__)
47 // arm64 has sa_restorer, but unwinding works best if you just let the
48 // kernel supply the default restorer from [vdso]. gdb doesn't care, but
49 // libgcc needs the nop that the kernel includes before the actual code.
50 // (We could add that ourselves, but why bother?)
51 #else
52 if (!(kernel_new_action.sa_flags & SA_RESTORER)) {
53 kernel_new_action.sa_flags |= SA_RESTORER;
54 kernel_new_action.sa_restorer = &__restore_rt; //用户空间处理函数执行完后返回内核空间的函数
55 }
56 #endif
57 #endif
58 }
59
60 __kernel_sigaction kernel_old_action;
61 int result = __rt_sigaction(signal,
62 (bionic_new_action != NULL) ? &kernel_new_action : NULL,
63 (bionic_old_action != NULL) ? &kernel_old_action : NULL,
64 sizeof(sigset_t));
65
66 if (bionic_old_action != NULL) {
67 bionic_old_action->sa_flags = kernel_old_action.sa_flags;
68 bionic_old_action->sa_handler = kernel_old_action.sa_handler;
69 bionic_old_action->sa_mask = kernel_old_action.sa_mask;
70 #if defined(SA_RESTORER)
71 bionic_old_action->sa_restorer = kernel_old_action.sa_restorer;
72 #endif
73 }
74
75 return result;
76 }
这个函数中把传入的参数sigaction 转成__kernel_sigaction类型,这两个结构体其实是一样的,然后调用__rt_sigaction()注册。
bionic/libc/arch-arm64/syscalls/__rt_sigaction.S
3 #include <private/bionic_asm.h>
4
5 ENTRY(__rt_sigaction)
6 mov x8, __NR_rt_sigaction
7 svc #0 //系统调用
8
9 cmn x0, #(MAX_ERRNO + 1)
10 cneg x0, x0, hi
11 b.hi __set_errno_internal
12
13 ret
14 END(__rt_sigaction)
15 .hidden __rt_sigaction
__rt_sigaction()是一个系统调用,kernel 中对应的入口是sys_rt_sigaction(),它最终会调用do_sigaction()完成注册。系统调用的过程在fork() 对应的博客中有详细分析,这里不再分析,所以我们直接认为调用了__rt_sigaction()函数后,就会跑到kernel中的do_sigaction()。
linux-4.10/kernel/signal.c
3065 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3066 {
/*
* linux-4.10/include/linux/signal.h
* struct sigaction { // sigaction 在kernel 中的定义
* #ifndef __ARCH_HAS_IRIX_SIGACTION
* __sighandler_t sa_handler;
* unsigned long sa_flags;
* #else
* unsigned int sa_flags;
* __sighandler_t sa_handler;
* #endif
* #ifdef __ARCH_HAS_SA_RESTORER
* __sigrestore_t sa_restorer;
* #endif
* sigset_t sa_mask; /* mask last for extensibility */
* };
*
* struct k_sigaction { // k_sigaction在kernel 中的定义
* struct sigaction sa; //相当于把sigaction 的成员搬到这里
* #ifdef __ARCH_HAS_KA_RESTORER
* __sigrestore_t ka_restorer;
* #endif
* };
*/
3067 struct task_struct *p = current, *t; //p 当前进程的task_struct 结构体,也就是PCB
3068 struct k_sigaction *k;
3069 sigset_t mask;
3070
3071 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
3072 return -EINVAL;
3073
/*
* linux-4.10/include/linux/sched.h
* struct task_struct {
* ...
/* signal handlers
* struct signal_struct *signal;
* struct sighand_struct *sighand; //保存信号相关的处理函数
*
* sigset_t blocked, real_blocked;
* sigset_t saved_sigmask; // restored if set_restore_sigmask() was used
* struct sigpending pending;
*
* unsigned long sas_ss_sp; //信号处理函数独立的堆栈
* size_t sas_ss_size; //堆栈的大小
* unsigned sas_ss_flags; //相关标志位
* ...
* }
*
* struct sighand_struct {
* atomic_t count;
* struct k_sigaction action[_NSIG]; #define _NSIG 64
* spinlock_t siglock;
* wait_queue_head_t signalfd_wqh;
* };
*/
3074 k = &p->sighand->action[sig-1]; // struct k_sigaction指针,可以认为是对应信号的处理函数
3075
3076 spin_lock_irq(&p->sighand->siglock);
3077 if (oact)
3078 *oact = *k; //把之前注册的k_sigaction 拷贝一份,返回给调用者
3079
3080 sigaction_compat_abi(act, oact);
3081
3082 if (act) {
3083 sigdelsetmask(&act->sa.sa_mask,
3084 sigmask(SIGKILL) | sigmask(SIGSTOP));
3085 *k = *act; //将user space 也就是应用程序中的handle 函数保存(注册进来)
...
3097 if (sig_handler_ignored(sig_handler(p, sig), sig)) { //handler 为 0或者 1的会被特殊处理
3098 sigemptyset(&mask);
3099 sigaddset(&mask, sig);
3100 flush_sigqueue_mask(&mask, &p->signal->shared_pending);
3101 for_each_thread(p, t)
3102 flush_sigqueue_mask(&mask, &t->pending);
3103 }
3104 }
3105
3106 spin_unlock_irq(&p->sighand->siglock);
3107 return 0;
3108 }
56 static void __user *sig_handler(struct task_struct *t, int sig) //返回之前的handle指针
57 {
58 return t->sighand->action[sig - 1].sa.sa_handler;
59 }
60
61 static int sig_handler_ignored(void __user *handler, int sig)
62 {
63 /* Is it explicitly or implicitly ignored? */
64 return handler == SIG_IGN || // #define SIG_IGN ((__force __sighandler_t)1) /* ignore signal */
65 (handler == SIG_DFL && sig_kernel_ignore(sig)); //#define SIG_DFL ((__force __sighandler_t)0) /* default signal handling */
66 }
经过上面的流程,程序中SIGABRT,SIGBUS,SIGFPE,SIGILL,SIGSEGV等信号对应的处理函数就注册完成了。接下来我们分析一下程序出现异常时,信号被处理的流程。
linux-4.10/arch/arm64/kernel/entry.S
317 ENTRY(vectors)
…
327
328 ventry el0_sync // Synchronous 64-bit EL0 //用户空间访问非法地址后走这里
329 ventry el0_irq // IRQ 64-bit EL0
330 ventry el0_fiq_invalid // FIQ 64-bit EL0
331 ventry el0_error_invalid // Error 64-bit EL0
…
344 END(vectors)
当程序在运行过程中访问到非法地址,比如空指针,或者未映射的地址,就会被处理器捕获到异常,走到对应的异常处理函数。怎么走到异常处理函数,其实很好理解,处理器捕获到异常后会跳到一个约定好的地址,kernel在初始化的时候往这些地址写对应的处理函数地址,这样就能走到处理函数中了。
512 * EL0 mode handlers.
513 */
514 .align 6
515 el0_sync:
516 kernel_entry 0 //保存用户空间的寄存器
517 mrs x25, esr_el1 // read the syndrome register
518 lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
519 cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
520 b.eq el0_svc
521 cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0 //走这条flow
522 b.eq el0_da
523 cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
524 b.eq el0_ia
发生异常时,处理器会把一些信息保存到对应的寄存器,具体是哪个寄存器,会保存什么样的信息,这里不详细介绍,异常处理函数根据寄存器中的信息,再跳到更加细化的处理函数中,程序访问非法地址,会走el0_da。
589 el0_da:
590 /*
591 * Data abort handling
592 */
593 mrs x26, far_el1
594 // enable interrupts before calling the main handler
595 enable_dbg_and_irq
596 ct_user_exit
597 bic x0, x26, #(0xff << 56) // x0 = x26 & 0x00ffffffffffffff,即清掉地址的最高8 位
598 mov x1, x25 // x1 = x25
599 mov x2, sp // x2 = sp
600 bl do_mem_abort //x0 , x1 , x2作为do_mem_abort() 的三个参数
601 b ret_to_user
linux-4.10/arch/arm64/mm/fault.c
568 /*
569 * Dispatch a data abort to the relevant handler.
570 */
571 asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
572 struct pt_regs *regs)
573 {
574 const struct fault_info *inf = fault_info + (esr & 63);
575 struct siginfo info;
576
577 if (!inf->fn(addr, esr, regs)) //如果是缺页异常,尝试修复,成功就直接返回
578 return;
579
580 pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n",
581 inf->name, esr, addr);
582
583 info.si_signo = inf->sig;
584 info.si_errno = 0;
585 info.si_code = inf->code;
586 info.si_addr = (void __user *)addr;
587 arm64_notify_die("", regs, &info, esr);
588 }
进入到do_mem_abort()后,会先根据esr 在fault_info 表中找到对应的处理函数并调用(例如缺页时尝试修复),如果处理成功就直接返回;如果处理失败,说明的确是访问了非法地址,就会走到arm64_notify_die()。
linux-4.10/arch/arm64/kernel/traps.c
298 void arm64_notify_die(const char *str, struct pt_regs *regs,
299 struct siginfo *info, int err)
300 {
301 if (user_mode(regs)) { //如果是用户空间的进程执行导致的异常
302 current->thread.fault_address = 0;
303 current->thread.fault_code = err;
304 force_sig_info(info->si_signo, info, current);
305 } else { //否则是kernel 里面的异常,走这里,最后会走到panic
306 die(str, regs, err);
307 }
308 }
如果是用户空间的程序访问了非法地址,会调用force_sig_info()发送信号给对应程序。
linux-4.10/kernel/signal.c
1165 int
1166 force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1167 {
1168 unsigned long int flags;
1169 int ret, blocked, ignored;
1170 struct k_sigaction *action;
1171
1172 spin_lock_irqsave(&t->sighand->siglock, flags);
1173 action = &t->sighand->action[sig-1]; //保存在task_struct 里面 k_sigaction
1174 ignored = action->sa.sa_handler == SIG_IGN;
1175 blocked = sigismember(&t->blocked, sig); //测试参数sig 代表的信号是否已加入至参数set信号集里. 如果信号集里已有该信号则返回1,否则返回0。
1176 if (blocked || ignored) {
1177 action->sa.sa_handler = SIG_DFL;
1178 if (blocked) {
1179 sigdelset(&t->blocked, sig);
1180 recalc_sigpending_and_wake(t);
1181 }
1182 }
1183 if (action->sa.sa_handler == SIG_DFL)
1184 t->signal->flags &= ~SIGNAL_UNKILLABLE;
1185 ret = specific_send_sig_info(sig, info, t);
1186 spin_unlock_irqrestore(&t->sighand->siglock, flags);
1187
1188 return ret;
1189 }
接着走到specific_send_sig_info()
1134 static int
1135 specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1136 {
1137 return send_signal(sig, info, t, 0);
1138 }
specific_send_sig_info() 接着调用到send_signal()
1082 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1083 int group)
1084 {
1085 int from_ancestor_ns = 0;
1086
1087 #ifdef CONFIG_PID_NS
1088 from_ancestor_ns = si_fromuser(info) &&
1089 !task_pid_nr_ns(current, task_active_pid_ns(t));
1090 #endif
1091
1092 return __send_signal(sig, info, t, group, from_ancestor_ns);
1093 }
send_signal() 也没有太多的处理逻辑,继续调用到__send_signal(),传入的group 参数的值是0。
978 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
979 int group, int from_ancestor_ns)
980 {
...
989 if (!prepare_signal(sig, t, //对特殊信号做相应处理
990 from_ancestor_ns || (info == SEND_SIG_FORCED)))
991 goto ret;
992
993 pending = group ? &t->signal->shared_pending : &t->pending; //这时group 为0,所以信号挂到该线程私有的pending 队列,而不是线程组共享的shared_pending 队列
...
1073 out_set:
1074 signalfd_notify(t, sig); // Deliver the signal to listening signalfd
1075 sigaddset(&pending->signal, sig);
1076 complete_signal(sig, t, group); //继续往下走到这里
1077 ret:
1078 trace_signal_generate(sig, info, t, group, result);
1079 return ret;
1080 }
__send_signal()里面做了各种各样的check,因为我们这里主要是熟悉这个流程,对于里面的具体细节,不做过多的介绍,需要了解的同学可以自己看源码。
876 static void complete_signal(int sig, struct task_struct *p, int group)
877 {
878 struct signal_struct *signal = p->signal;
879 struct task_struct *t;
880
881 /*
882 * Now find a thread we can wake up to take the signal off the queue.
883 *
884 * If the main thread wants the signal, it gets first crack.
885 * Probably the least surprising to the average bear.
886 */
887 if (wants_signal(sig, p))
888 t = p;
889 else if (!group || thread_group_empty(p))
890 /*
891 * There is just one thread and it does not need to be woken.
892 * It will dequeue unblocked signals before it runs again.
893 */
894 return;
895 else {
896 /*
897 * Otherwise try to find a suitable thread.
898 */
899 t = signal->curr_target;
900 while (!wants_signal(sig, t)) {
901 t = next_thread(t);
902 if (t == signal->curr_target)
903 /*
904 * No thread needs to be woken.
905 * Any eligible threads will see
906 * the signal in the queue soon.
907 */
908 return;
909 }
910 signal->curr_target = t;
911 }
912
913 /*
914 * Found a killable thread. If the signal will be fatal,
915 * then start taking the whole group down immediately.
916 */
917 if (sig_fatal(p, sig) &&
918 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
919 !sigismember(&t->real_blocked, sig) &&
920 (sig == SIGKILL || !t->ptrace)) {
921 /*
922 * This signal will be fatal to the whole group.
923 */
924 if (!sig_kernel_coredump(sig)) {
925 /*
926 * Start a group exit and wake everybody up.
927 * This way we don't have other threads
928 * running and doing things after a slower
929 * thread has the fatal signal pending.
930 */
931 signal->flags = SIGNAL_GROUP_EXIT;
932 signal->group_exit_code = sig;
933 signal->group_stop_count = 0;
934 t = p;
935 do {
936 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
937 sigaddset(&t->pending.signal, SIGKILL);
938 signal_wake_up(t, 1);
939 } while_each_thread(p, t);
940 return;
941 }
942 }
943
944 /*
945 * The signal is already in the shared-pending queue.
946 * Tell the chosen thread to wake up and dequeue it.
947 */
948 signal_wake_up(t, sig == SIGKILL);
949 return;
950 }
complete_signal() 对信号进行了进一步处理,最后调用到signal_wake_up()
linux-4.10/include/uapi/asm-generic/signal.h
10 #define SIGHUP 1
11 #define SIGINT 2
12 #define SIGQUIT 3
13 #define SIGILL 4 //执行了非法指令. 通常是因为可执行文件本身出现错误, 或者试图执行数据段. 堆栈溢出时也有可能产生这个信号。
14 #define SIGTRAP 5
15 #define SIGABRT 6
16 #define SIGIOT 6 //调用abort函数生成的信号
17 #define SIGBUS 7
18 #define SIGFPE 8
19 #define SIGKILL 9 //用来立即结束程序的运行. 本信号不能被阻塞、处理和忽略。
20 #define SIGUSR1 10
21 #define SIGSEGV 11 //试图访问未分配给自己的内存, 或试图往没有写权限的内存地址写数据.
22 #define SIGUSR2 12
23 #define SIGPIPE 13 //管道破裂。这个信号通常在进程间通信产生
24 #define SIGALRM 14
25 #define SIGTERM 15
26 #define SIGSTKFLT 16
27 #define SIGCHLD 17
28 #define SIGCONT 18 //让一个停止(stopped)的进程继续执行. 本信号不能被阻塞.
29 #define SIGSTOP 19
30 #define SIGTSTP 20
上面给出了一些信号对应的值,在实际中,遇到最多的情况就是SIGSEGV,也就是访问了非法地址,应该90%以上是这种情况。
linux-4.10/include/linux/sched.h
3520 static inline void signal_wake_up(struct task_struct *t, bool resume)
3521 {
3522 signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
3523 }
现在考虑的是SIGSEGV 信号的情况,所以上面传下来的resume 为0,调用到signal_wake_up_state() 传入的第二个参数也是0
linux-4.10/kernel/signal.c
645 void signal_wake_up_state(struct task_struct *t, unsigned int state)
646 {
647 set_tsk_thread_flag(t, TIF_SIGPENDING);
648 /*
649 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
650 * case. We don't check t->state here because there is a race with it
651 * executing another processor and just now entering stopped state.
652 * By using wake_up_state, we ensure the process will wake up and
653 * handle its death signal.
654 */
655 if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
656 kick_process(t);
657 }
走了一圈,又回到signal.c 这个文件里面的代码,set_tsk_thread_flag()会在thread_info.flags 中置上TIF_SIGPENDING 标志位,wake_up_state()和kick_process()就不深入了解了。
linux-4.10/arch/arm64/kernel/entry.S
589 el0_da:
590 /*
591 * Data abort handling
592 */
593 mrs x26, far_el1
594 // enable interrupts before calling the main handler
595 enable_dbg_and_irq
596 ct_user_exit
597 bic x0, x26, #(0xff << 56)
598 mov x1, x25
599 mov x2, sp
//bl 表示带链接的跳转(会保存返回地址):首先将当前指令的下一条指令地址保存在LR寄存器,然后跳转到label
//b 表示无条件跳转
600 bl do_mem_abort //do_mem_abort在kernel 走了一圈,返回了
601 b ret_to_user //
重新回到entry.S 的el0_da 代码块,do_mem_abort跑完了之后,会继续往下,跑到ret_to_user
/*
770 * "slow" syscall return path.
771 */
772 ret_to_user:
773 disable_irq // disable interrupts
774 ldr x1, [tsk, #TSK_TI_FLAGS] //tsk .req x28 current thread_info x1 = thread_info.flags
775 and x2, x1, #_TIF_WORK_MASK // x2 = x1 & _TIF_WORK_MASK 也就是取出thread_info.flags 的标志位
776 cbnz x2, work_pending // 如果x2 != 0 跳到 work_pending
777 finish_ret_to_user:
778 enable_step_tsk x1, x2
779 kernel_exit 0
780 ENDPROC(ret_to_user)
前面说了set_tsk_thread_flag()会把thread_info.flags设置成TIF_SIGPENDING(增加这个flag),所以这里会走到work_pending。
758 /*
759 * Ok, we need to do extra processing, enter the slow path.
760 */
761 work_pending:
762 mov x0, sp // 'regs'
763 bl do_notify_resume
764 #ifdef CONFIG_TRACE_IRQFLAGS
765 bl trace_hardirqs_on // enabled while in userspace
766 #endif
767 ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
768 b finish_ret_to_user
继续往下会调用到do_notify_resume()函数
linux-4.10/arch/arm64/kernel/signal.c
402 asmlinkage void do_notify_resume(struct pt_regs *regs,
403 unsigned int thread_flags)
404 {
405 /*
406 * The assembly code enters us with IRQs off, but it hasn't
407 * informed the tracing code of that for efficiency reasons.
408 * Update the trace code with the current status.
409 */
410 trace_hardirqs_off();
411 do {
412 if (thread_flags & _TIF_NEED_RESCHED) {
413 schedule();
414 } else {
415 local_irq_enable();
416
417 if (thread_flags & _TIF_UPROBE)
418 uprobe_notify_resume(regs);
419
420 if (thread_flags & _TIF_SIGPENDING) //走的是这里
421 do_signal(regs);
422
423 if (thread_flags & _TIF_NOTIFY_RESUME) {
424 clear_thread_flag(TIF_NOTIFY_RESUME);
425 tracehook_notify_resume(regs);
426 }
427
428 if (thread_flags & _TIF_FOREIGN_FPSTATE)
429 fpsimd_restore_current_state();
430 }
431
432 local_irq_disable();
433 thread_flags = READ_ONCE(current_thread_info()->flags);
434 } while (thread_flags & _TIF_WORK_MASK);
435 }
通过前面的了解,我们知道thread_flags & _TIF_SIGPENDING 这个条件是成立的,所以继续走到do_signal(regs);
static void do_signal(struct pt_regs *regs)
332 {
...
366 /*
367 * Get the signal to deliver. When running under ptrace, at this point
368 * the debugger may change all of our registers.
369 */
370 if (get_signal(&ksig)) {
...
385 handle_signal(&ksig, regs);
386 return;
387 }
...
399 restore_saved_sigmask();
400 }
接着会调用handle_signal() 函数
285 /*
286 * OK, we're invoking a handler
287 */
288 static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
289 {
290 struct task_struct *tsk = current;
291 sigset_t *oldset = sigmask_to_save();
292 int usig = ksig->sig;
293 int ret;
294
295 /*
296 * Set up the stack frame
297 */
298 if (is_compat_task()) {
299 if (ksig->ka.sa.sa_flags & SA_SIGINFO)
300 ret = compat_setup_rt_frame(usig, ksig, oldset, regs);
301 else
302 ret = compat_setup_frame(usig, ksig, oldset, regs);
303 } else {
304 ret = setup_rt_frame(usig, ksig, oldset, regs); //走这里
305 }
306
307 /*
308 * Check that the resulting registers are actually sane.
309 */
310 ret |= !valid_user_regs(&regs->user_regs, current);
311
312 /*
313 * Fast forward the stepping logic so we step into the signal
314 * handler.
315 */
316 if (!ret)
317 user_fastforward_single_step(tsk);
318
319 signal_setup_done(ret, ksig, 0);
320 }
我们重点看setup_rt_frame()做了什么。
250 static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
251 struct pt_regs *regs)
252 {
253 struct rt_sigframe __user *frame;
254 int err = 0;
255
256 frame = get_sigframe(ksig, regs); //获取用户空间处理信号的栈
257 if (!frame)
258 return 1;
259
260 __put_user_error(0, &frame->uc.uc_flags, err);
261 __put_user_error(NULL, &frame->uc.uc_link, err);
262
263 err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
264 err |= setup_sigframe(frame, regs, set);
265 if (err == 0) {
266 setup_return(regs, &ksig->ka, frame, usig);
267 if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
268 err |= copy_siginfo_to_user(&frame->info, &ksig->info);
269 regs->regs[1] = (unsigned long)&frame->info;
270 regs->regs[2] = (unsigned long)&frame->uc;
271 }
272 }
273
274 return err;
275 }
setup_rt_frame()会为用户空间执行信号处理函数准备好栈和相关参数。
232 static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
233 void __user *frame, int usig)
234 {
235 __sigrestore_t sigtramp;
236
237 regs->regs[0] = usig;
238 regs->sp = (unsigned long)frame;
239 regs->regs[29] = regs->sp + offsetof(struct rt_sigframe, fp);
240 regs->pc = (unsigned long)ka->sa.sa_handler; //之前注册的用户空间处理函数
241
242 if (ka->sa.sa_flags & SA_RESTORER)
243 sigtramp = ka->sa.sa_restorer;
244 else
245 sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp);
246
247 regs->regs[30] = (unsigned long)sigtramp; //用户空间的处理函数执行完后,会调用这个函数再返回内核空间
248 }
regs->pc 设置成之前用户空间传下来的处理函数,所以返回用户空间时会执行处理函数;regs->regs[30](即LR)设置成sigtramp,用户空间执行完处理函数后,通过这个函数再返回内核空间。
linux-4.10/arch/arm64/kernel/vdso/vdso.lds.S
/*
96 * Make the sigreturn code visible to the kernel.
97 */
98 VDSO_sigtramp = __kernel_rt_sigreturn;
linux-4.10/arch/arm64/kernel/vdso/sigreturn.S
28 ENTRY(__kernel_rt_sigreturn)
29 .cfi_startproc
30 .cfi_signal_frame
31 .cfi_def_cfa x29, 0
32 .cfi_offset x29, 0 * 8
33 .cfi_offset x30, 1 * 8
34 mov x8, #__NR_rt_sigreturn
35 svc #0
36 .cfi_endproc
37 ENDPROC(__kernel_rt_sigreturn)
所以上面的sigtramp 里面插入了一个系统调用__NR_rt_sigreturn,也就是信号处理后返回。
bionic/linker/debugger.cpp
302 __LIBC_HIDDEN__ void debuggerd_init() {
...
306 action.sa_sigaction = debuggerd_signal_handler;
...
321 }
再回到debuggerd_init()中,kernel 将信号发送出来后,经过许多环节的处理,会返回用户空间,调用debuggerd_signal_handler()这个处理函数。
262 static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
...
271 send_debuggerd_packet(info);
...
294 int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info);
295 if (rc != 0) {
296 __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297 strerror(errno));
298 _exit(0);
299 }
300 }
这里会调用send_debuggerd_packet() 向debuggerd进程发送信息。
208 static void send_debuggerd_packet(siginfo_t* info) {
...
226 int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC); //创建socket连接
227 if (s == -1) {
228 __libc_format_log(ANDROID_LOG_FATAL, "libc", "Unable to open connection to debuggerd: %s",
229 strerror(errno));
230 return;
231 }
232
233 // debuggerd knows our pid from the credentials on the
234 // local socket but we need to tell it the tid of the crashing thread.
235 // debuggerd will be paranoid and verify that we sent a tid
236 // that's actually in our process.
237 debugger_msg_t msg;
238 msg.action = DEBUGGER_ACTION_CRASH;
239 msg.tid = gettid(); //消息中有tid,也就是出现异常的线程号
240 msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message);
241 msg.original_si_code = (info != nullptr) ? info->si_code : 0;
242 ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg))); //将消息通过socket 发送给debuggerd
...
255 close(s);
256 }
接着debuggerd 会收到发送过来的消息
system/core/debuggerd/debuggerd.cpp
921 int main(int argc, char** argv) {
922 union selinux_callback cb;
923 if (argc == 1) {
924 cb.func_audit = audit_callback;
925 selinux_set_callback(SELINUX_CB_AUDIT, cb);
926 cb.func_log = selinux_log_callback;
927 selinux_set_callback(SELINUX_CB_LOG, cb);
928 return do_server();
929 }
930
931 bool dump_backtrace = false;
932 bool have_tid = false;
933 pid_t tid = 0;
934 for (int i = 1; i < argc; i++) {
935 if (!strcmp(argv[i], "-b")) {
936 dump_backtrace = true;
937 } else if (!have_tid) {
938 tid = atoi(argv[i]);
939 have_tid = true;
940 } else {
941 usage();
942 return 1;
943 }
944 }
945 if (!have_tid) {
946 usage();
947 return 1;
948 }
949 return do_explicit_dump(tid, dump_backtrace);
950 }
debuggerd进程运行起来之后,会根据参数走不同的流程,如果是默认的走do_server(),也就是创建一个socket server,等待client端连接,处理相应的请求。另一种情况就是我们手动调用它dump 某个进程的backtrace,前面有讲过,adb shell debuggerd -b pid .
842 static int do_server() {
843 // debuggerd crashes can't be reported to debuggerd.
844 // Reset all of the crash handlers.
845 signal(SIGABRT, SIG_DFL);
846 signal(SIGBUS, SIG_DFL);
847 signal(SIGFPE, SIG_DFL);
848 signal(SIGILL, SIG_DFL);
849 signal(SIGSEGV, SIG_DFL);
850 #ifdef SIGSTKFLT
851 signal(SIGSTKFLT, SIG_DFL);
852 #endif
853 signal(SIGTRAP, SIG_DFL);
854
855 // Ignore failed writes to closed sockets
856 signal(SIGPIPE, SIG_IGN);
857
858 // Block SIGCHLD so we can sigtimedwait for it.
859 sigset_t sigchld;
860 sigemptyset(&sigchld);
861 sigaddset(&sigchld, SIGCHLD);
862 sigprocmask(SIG_SETMASK, &sigchld, nullptr);
863
864 int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT, //创建socket ,充当server端
865 SOCK_STREAM | SOCK_CLOEXEC);
866 if (s == -1) return 1;
867
868 // Fork a process that stays root, and listens on a pipe to pause and resume the target.
869 if (!start_signal_sender()) {
870 ALOGE("debuggerd: failed to fork signal sender");
871 return 1;
872 }
873
874 ALOGI("debuggerd: starting\n");
875
876 for (;;) { //循环等待
877 sockaddr_storage ss;
878 sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
879 socklen_t alen = sizeof(ss);
880
881 ALOGV("waiting for connection\n");
882 int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC); //等待client端来连接
883 if (fd == -1) {
884 ALOGE("accept failed: %s\n", strerror(errno));
885 continue;
886 }
887
888 handle_request(fd); //处理client端的请求
889 }
890 return 0;
891 }
do_server() 会先把debuggerd 自身的crash 信号处理恢复为默认行为(SIG_DFL),因为debuggerd 自己crash 时无法再上报给debuggerd;然后创建socket,充当server,调用accept4()等待client端来连接,收到连接后,调用handle_request(fd)处理。
801 static void handle_request(int fd) {
802 ALOGV("handle_request(%d)\n", fd);
803
804 ScopedFd closer(fd);
805 debugger_request_t request;
806 memset(&request, 0, sizeof(request));
807 int status = read_request(fd, &request); //读取client端发送过来的消息
808 if (status != 0) {
809 return;
810 }
...
831 // Fork a child to handle the rest of the request.
832 pid_t fork_pid = fork();
833 if (fork_pid == -1) {
834 ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
835 } else if (fork_pid == 0) {
836 worker_process(fd, request); //fork 出子进程来处理
837 } else {
838 monitor_worker_process(fork_pid, request);
839 }
840 }
handle_request()会把client 端,也就是信号处理函数中发送过来的信息读出来,然后创建出一个子进程继续处理。
565 static void worker_process(int fd, debugger_request_t& request) {
...
598 // Attach to the target process.
599 if (!ptrace_attach_thread(request.pid, request.tid)) { //ptrace 进程
600 ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
601 exit(1);
602 }
...
608 if (request.action == DEBUGGER_ACTION_CRASH) { //通过前面的代码我们知道action是 DEBUGGER_ACTION_CRASH
609 pid_t pid;
610 uid_t uid;
611 gid_t gid;
612 if (get_process_info(request.tid, &pid, &uid, &gid) != 0) {
613 ALOGE("debuggerd: failed to get process info for tid '%d'", request.tid);
614 exit(1);
615 }
...
624 }
625
626 // Don't attach to the sibling threads if we want to attach gdb.
627 // Supposedly, it makes the process less reliable.
628 bool attach_gdb = should_attach_gdb(request);
629 if (attach_gdb) {
630 // Open all of the input devices we need to listen for VOLUMEDOWN before dropping privileges.
631 if (init_getevent() != 0) {
632 ALOGE("debuggerd: failed to initialize input device, not waiting for gdb");
633 attach_gdb = false;
634 }
636 }
...
662 int crash_signal = SIGKILL;
663 succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings, //dump 异常进程的 寄存器 和backtrace 信息
664 &crash_signal, amfd_data.get());
...
692 for (pid_t sibling : siblings) {
693 ptrace(PTRACE_DETACH, sibling, 0, 0); //ptrace DETACH
694 }
...
717
718 close(amfd); //关闭socket连接
719
720 exit(!succeeded);
721 }
worker_process()中,会PTRACE_ATTACH 上发生异常的进程,然后dump 出进程的信息用于debug,最后PTRACE_DETACH 该进程,关于ptrace 的功能,这里不再介绍,用户空间调用ptrace 实际上是系统调用的接口,真正的实现在kernel中。
483 static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
484 BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
485 int* crash_signal, std::string* amfd_data) {
...
492 while (true) {
493 int signal = wait_for_signal(request.tid, &total_sleep_time_usec);
494 switch (signal) {
495 case -1:
496 ALOGE("debuggerd: timed out waiting for signal");
497 return false;
498
...
517 case SIGABRT:
518 case SIGBUS:
519 case SIGFPE:
520 case SIGILL:
521 case SIGSEGV:
522 #ifdef SIGSTKFLT
523 case SIGSTKFLT:
524 #endif
525 case SIGSYS:
526 case SIGTRAP:
527 ALOGV("stopped -- fatal signal\n");
528 *crash_signal = signal;
529 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
530 request.original_si_code, request.abort_msg_address, amfd_data);
531 break;
532
533 default:
534 ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
535 break;
536 }
537 break;
538 }
539
540 return true;
541 }
ptrace attach 上出现异常的进程后,该进程会重新跑起来,还是跑出现异常的那段代码,所以又会发生异常,但是这时候的信号不会发送给debuggerd_signal_handler()处理函数,而是先通知当前的debuggerd 进程。debuggerd 进程收到信号后会调用engrave_tombstone()。
system/core/debuggerd/tombstone.cpp
688 void engrave_tombstone(int tombstone_fd, BacktraceMap* map, pid_t pid, pid_t tid,
689 const std::set<pid_t>& siblings, int signal, int original_si_code,
690 uintptr_t abort_msg_address, std::string* amfd_data) {
691 log_t log;
692 log.current_tid = tid;
693 log.crashed_tid = tid;
694
695 if (tombstone_fd < 0) {
696 ALOGE("debuggerd: skipping tombstone write, nothing to do.\n");
697 return;
698 }
699
700 log.tfd = tombstone_fd;
701 log.amfd_data = amfd_data;
702 dump_crash(&log, map, pid, tid, siblings, signal, original_si_code, abort_msg_address);
703 }
走到dump_crash() dump相关信息
607 // Dumps all information about the specified pid to the tombstone.
608 static void dump_crash(log_t* log, BacktraceMap* map, pid_t pid, pid_t tid,
609 const std::set<pid_t>& siblings, int signal, int si_code,
610 uintptr_t abort_msg_address) {
611 // don't copy log messages to tombstone unless this is a dev device
612 char value[PROPERTY_VALUE_MAX];
613 property_get("ro.debuggable", value, "0");
614 bool want_logs = (value[0] == '1');
615
616 _LOG(log, logtype::HEADER,
617 "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");
618 dump_header_info(log);
619 dump_thread(log, pid, tid, map, signal, si_code, abort_msg_address, true);
620 if (want_logs) {
621 dump_logs(log, pid, 5);
622 }
623
624 if (!siblings.empty()) {
625 for (pid_t sibling : siblings) {
626 dump_thread(log, pid, sibling, map, 0, 0, 0, false);
627 }
628 }
629
630 if (want_logs) {
631 dump_logs(log, pid, 0);
632 }
633 }