android 抓取crash信息流程简介

最新推荐文章于 2024-09-06 22:13:50 发布

xiongtiancheng

最新推荐文章于 2024-09-06 22:13:50 发布

阅读量2.3k

点赞数 1

CC 4.0 BY-SA版权

分类专栏： debuggerd 文章标签： android crash debuggerd dump

本文链接：https://blog.youkuaiyun.com/xiongtiancheng/article/details/78824730

debuggerd 专栏收录该内容

0 篇文章

订阅专栏

本文介绍了Android系统中处理Crash的流程，从linker的debuggerd初始化开始，详细讲解了异常信号的注册、处理，以及debuggerd如何在接收到异常信息后进行堆栈跟踪和崩溃信息dump的过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

通过上一篇博客我们知道，linker 完成自身重定位之后，在对可执行程序进行重定位的过程中，会初始化debuggerd，也就是注册异常信号的处理函数，以便在程序发生异常的时候抓取异常信息。

4185  /*
4186   * This code is called after the linker has linked itself and
4187   * fixed it's own GOT. It is safe to make references to externs
4188   * and other non-local data at this point.
4189   */
4190  static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
...
4202    debuggerd_init();  //进行debuggerd 的初始化
...
4389    TRACE("[ Ready to execute \"%s\" @ %p ]", si->get_realpath(), reinterpret_cast<void*>(si->entry));
4390    return si->entry;
4391  }

追踪debuggerd_init() 这个函数

bionic/linker/debugger.cpp

 __LIBC_HIDDEN__ void debuggerd_init() {
/*
 * bionic/libc/kernel/uapi/asm-generic/signal.h
 * struct sigaction {
 * __sighandler_t sa_handler;  //信号对应的处理函数
 *   unsigned long sa_flags;
 *  #ifdef SA_RESTORER
 *   __sigrestore_t sa_restorer;  //处理完成之后的返回函数，一般不用手动设置，libc 或kernel 会代为设置
 * #endif
 *   sigset_t sa_mask;
 * };
*/
303    struct sigaction action;
304    memset(&action, 0, sizeof(action));
305    sigemptyset(&action.sa_mask);
306    action.sa_sigaction = debuggerd_signal_handler;  // debuggerd_signal_handler 就是处理函数
307    action.sa_flags = SA_RESTART | SA_SIGINFO;
308  
309    // Use the alternate signal stack if available so we can catch stack overflows.
310    action.sa_flags |= SA_ONSTACK;  //使用独立的栈空间
311  
312    sigaction(SIGABRT, &action, nullptr);
313    sigaction(SIGBUS, &action, nullptr);
314    sigaction(SIGFPE, &action, nullptr);
315    sigaction(SIGILL, &action, nullptr);
316    sigaction(SIGSEGV, &action, nullptr);
317  #if defined(SIGSTKFLT)
318    sigaction(SIGSTKFLT, &action, nullptr);
319  #endif
320    sigaction(SIGTRAP, &action, nullptr);
321  }

debuggerd_signal_handler 就是程序收到SIGABRT，SIGBUS，SIGFPE，SIGILL，SIGSEGV等这几个信号时，会调用的处理函数。

bionic/libc/bionic/sigaction.cpp

36  extern "C" int __rt_sigaction(int, const struct __kernel_sigaction*, struct __kernel_sigaction*, size_t);
37  
38  int sigaction(int signal, const struct sigaction* bionic_new_action, struct sigaction* bionic_old_action) {
39    __kernel_sigaction kernel_new_action;
40    if (bionic_new_action != NULL) {
41      kernel_new_action.sa_flags = bionic_new_action->sa_flags;
42      kernel_new_action.sa_handler = bionic_new_action->sa_handler;
43      kernel_new_action.sa_mask = bionic_new_action->sa_mask;
44  #if defined(SA_RESTORER)
45      kernel_new_action.sa_restorer = bionic_new_action->sa_restorer;
46  #if defined(__aarch64__)
47      // arm64 has sa_restorer, but unwinding works best if you just let the
48      // kernel supply the default restorer from [vdso]. gdb doesn't care, but
49      // libgcc needs the nop that the kernel includes before the actual code.
50      // (We could add that ourselves, but why bother?)
51  #else
52      if (!(kernel_new_action.sa_flags & SA_RESTORER)) {
53        kernel_new_action.sa_flags |= SA_RESTORER;
54        kernel_new_action.sa_restorer = &__restore_rt;  //用户空间处理函数执行完后返回内核空间的函数
55      }
56  #endif
57  #endif
58    }
59  
60    __kernel_sigaction kernel_old_action;
61    int result = __rt_sigaction(signal,
62                                (bionic_new_action != NULL) ? &kernel_new_action : NULL,
63                                (bionic_old_action != NULL) ? &kernel_old_action : NULL,
64                                sizeof(sigset_t));
65  
66    if (bionic_old_action != NULL) {
67      bionic_old_action->sa_flags = kernel_old_action.sa_flags;
68      bionic_old_action->sa_handler = kernel_old_action.sa_handler;
69      bionic_old_action->sa_mask = kernel_old_action.sa_mask;
70  #if defined(SA_RESTORER)
71      bionic_old_action->sa_restorer = kernel_old_action.sa_restorer;
72  #endif
73    }
74  
75    return result;
76  }

这个函数中把传入的参数sigaction 转成__kernel_sigaction类型，这两个结构体其实是一样的，然后调用__rt_sigaction()注册。

bionic/libc/arch-arm64/syscalls/__rt_sigaction.S

3  #include <private/bionic_asm.h>
4  
5  ENTRY(__rt_sigaction)
6      mov     x8, __NR_rt_sigaction
7      svc     #0  //系统调用
8  
9      cmn     x0, #(MAX_ERRNO + 1)
10      cneg    x0, x0, hi
11      b.hi    __set_errno_internal
12  
13      ret
14  END(__rt_sigaction)
15  .hidden __rt_sigaction

__rt_sigaction()是一个系统调用，kernel 中对应的处理函数是do_sigaction()，系统调用的过程在fork() 对应的博客中有详细分析，这里不再分析，所以我们理所当然地认为调用了__rt_sigaction()函数后，就会跑到kernel中的do_sigaction()。

linux-4.10/kernel/signal.c

3065  int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3066  {
/*
 * linux-4.10/include/linux/signal.h
 *  struct sigaction {   // sigaction 在kernel 中的定义
 *  #ifndef __ARCH_HAS_IRIX_SIGACTION
 *  	__sighandler_t	sa_handler;
 *  	unsigned long	sa_flags;
 *  #else
 *   	unsigned int	sa_flags;
 *  	__sighandler_t	sa_handler;
 *   #endif
 *   #ifdef __ARCH_HAS_SA_RESTORER
 *   	__sigrestore_t sa_restorer;
 *   #endif
 *   	sigset_t	sa_mask;	/* mask last for extensibility */
 *  };
 * 
 *   struct k_sigaction { // k_sigaction在kernel 中的定义
 *   	struct sigaction sa;   //相当于把sigaction 的成员搬到这里
 *   #ifdef __ARCH_HAS_KA_RESTORER
 *  	__sigrestore_t ka_restorer;
 *   #endif
 *  };
*/
3067  	struct task_struct *p = current, *t; //p 当前进程的task_struct 结构体，也就是PCB
3068  	struct k_sigaction *k;
3069  	sigset_t mask;
3070  
3071  	if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
3072  		return -EINVAL;
3073  
/*
 * linux-4.10/include/linux/sched.h
 * struct task_struct {
 * ...
/*   signal handlers 
 *   	struct signal_struct *signal;
 * 	struct sighand_struct *sighand;  //保存信号相关的处理函数
 * 
 * 	sigset_t blocked, real_blocked;
 * 	sigset_t saved_sigmask;	// restored if set_restore_sigmask() was used 
 *  	struct sigpending pending;
 *  
 *  	unsigned long sas_ss_sp;  //信号处理函数独立的堆栈
 *   	size_t sas_ss_size;   //堆栈的大小
 * 	unsigned sas_ss_flags;   //相关标志位
 * ...
 * }
 * 
 *  struct sighand_struct {
 *  	atomic_t		count;
 * 	struct k_sigaction	action[_NSIG];  #define _NSIG	64
 *   	spinlock_t		siglock;
 *   	wait_queue_head_t	signalfd_wqh;
 *  };
*/
3074  	k = &p->sighand->action[sig-1];  // struct k_sigaction指针，可以认为是对应信号的处理函数
3075  
3076  	spin_lock_irq(&p->sighand->siglock);
3077  	if (oact)
3078  		*oact = *k;  //把之前注册的k_sigaction 拷贝出来，返回给调用者
3079  
3080  	sigaction_compat_abi(act, oact);
3081  
3082  	if (act) {
3083  		sigdelsetmask(&act->sa.sa_mask,
3084  			      sigmask(SIGKILL) | sigmask(SIGSTOP));
3085  		*k = *act;  //将user space 也就是应用程序中的handle 函数保存（注册进来）
...
3097  		if (sig_handler_ignored(sig_handler(p, sig), sig)) {  //handler 为 0或者 1的会被特殊处理
3098  			sigemptyset(&mask);
3099  			sigaddset(&mask, sig);
3100  			flush_sigqueue_mask(&mask, &p->signal->shared_pending);
3101  			for_each_thread(p, t)
3102  				flush_sigqueue_mask(&mask, &t->pending);
3103  		}
3104  	}
3105  
3106  	spin_unlock_irq(&p->sighand->siglock);
3107  	return 0;
3108  }

56  static void __user *sig_handler(struct task_struct *t, int sig)  //返回该信号当前注册的handler 指针
57  {
58  	return t->sighand->action[sig - 1].sa.sa_handler;
59  }
60
61  static int sig_handler_ignored(void __user *handler, int sig)
62  {
63  	/* Is it explicitly or implicitly ignored? */
64  	return handler == SIG_IGN || // #define SIG_IGN	((__force __sighandler_t)1)	/* ignore signal */
65  		(handler == SIG_DFL && sig_kernel_ignore(sig)); //#define SIG_DFL	((__force __sighandler_t)0)	/* default signal handling */
66  }

经过上面的流程，程序中SIGABRT，SIGBUS，SIGFPE，SIGILL，SIGSEGV等信号对应的处理函数就注册完成了。接下来我们分析一下程序出现异常时，信号被处理的流程。

linux-4.10/arch/arm64/kernel/entry.S

317  ENTRY(vectors)
…
327  
328  	ventry	el0_sync			// Synchronous 64-bit EL0  //用户空间访问非法地址后走这里
329  	ventry	el0_irq				// IRQ 64-bit EL0
330  	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
331  	ventry	el0_error_invalid		// Error 64-bit EL0
…
344  END(vectors)

当程序在运行过程中访问到非法地址，比如空指针，或者未映射的地址，就会被处理器捕获到异常，走到对应的异常处理函数。怎么走到异常处理函数，其实很好理解，处理器捕获到异常后会跳到一个约定好的地址，kernel在初始化的时候往这些地址写对应的处理函数地址，这样就能走到处理函数中了。

512   * EL0 mode handlers.
513   */
514  	.align	6
515  el0_sync:
516  	kernel_entry 0  //保存用户空间的寄存器
517  	mrs	x25, esr_el1			// read the syndrome register
518  	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
519  	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state
520  	b.eq	el0_svc
521  	cmp	x24, #ESR_ELx_EC_DABT_LOW	// data abort in EL0  //走这条flow
522  	b.eq	el0_da
523  	cmp	x24, #ESR_ELx_EC_IABT_LOW	// instruction abort in EL0
524  	b.eq	el0_ia

发生异常时，处理器会把一些信息保存到对应的寄存器，具体是哪个寄存器，会保存什么样的信息，这里不详细介绍，异常处理函数根据寄存器中的信息，再跳到更加细化的处理函数中，程序访问非法地址，会走el0_da。

589  el0_da:
590  	/*
591  	 * Data abort handling
592  	 */
593  	mrs	x26, far_el1
594  	// enable interrupts before calling the main handler
595  	enable_dbg_and_irq
596  	ct_user_exit
597  	bic	x0, x26, #(0xff << 56)  // x0 = x26 & ~(0xff << 56)，即清掉x26 的最高8 位
598  	mov	x1, x25   // x1 = x25
599  	mov	x2, sp    // x2 = sp
600  	bl	do_mem_abort  //x0 , x1 , x2作为do_mem_abort() 的三个参数
601  	b	ret_to_user

linux-4.10/arch/arm64/mm/fault.c

568  /*
569   * Dispatch a data abort to the relevant handler.
570   */
571  asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
572  					 struct pt_regs *regs)
573  {
574  	const struct fault_info *inf = fault_info + (esr & 63);
575  	struct siginfo info;
576  
577  	if (!inf->fn(addr, esr, regs))  //如果是缺页异常，尝试修复，成功就直接返回
578  		return;
579  
580  	pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n",
581  		 inf->name, esr, addr);
582  
583  	info.si_signo = inf->sig;
584  	info.si_errno = 0;
585  	info.si_code  = inf->code;
586  	info.si_addr  = (void __user *)addr;
587  	arm64_notify_die("", regs, &info, esr);   
588  }

进入到do_mem_abort ()后，会先判断是不是缺页导致的异常；如果不是缺页、的确是访问了非法地址，就会走到arm64_notify_die()。

linux-4.10/arch/arm64/kernel/traps.c

298  void arm64_notify_die(const char *str, struct pt_regs *regs,
299  		      struct siginfo *info, int err)
300  {
301  	if (user_mode(regs)) {  //如果是用户空间的进程执行导致的异常
302  		current->thread.fault_address = 0;
303  		current->thread.fault_code = err;
304  		force_sig_info(info->si_signo, info, current); 
305  	} else { //否则是kernel 里面的异常，走这里，最后会走到panic
306  		die(str, regs, err);
307  	}
308  }

如果是用户空间的程序访问了非法地址，会调用force_sig_info()发送信号给对应程序。

linux-4.10/kernel/signal.c

1165  int
1166  force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1167  {
1168  	unsigned long int flags;
1169  	int ret, blocked, ignored;
1170  	struct k_sigaction *action;
1171  
1172  	spin_lock_irqsave(&t->sighand->siglock, flags);
1173  	action = &t->sighand->action[sig-1];  //保存在task_struct 里面 k_sigaction
1174  	ignored = action->sa.sa_handler == SIG_IGN;
1175  	blocked = sigismember(&t->blocked, sig); //测试参数sig 代表的信号是否已加入至参数set信号集里. 如果信号集里已有该信号则返回1，否则返回0。
1176  	if (blocked || ignored) {
1177  		action->sa.sa_handler = SIG_DFL;
1178  		if (blocked) {
1179  			sigdelset(&t->blocked, sig);
1180  			recalc_sigpending_and_wake(t);
1181  		}
1182  	}
1183  	if (action->sa.sa_handler == SIG_DFL)
1184  		t->signal->flags &= ~SIGNAL_UNKILLABLE;
1185  	ret = specific_send_sig_info(sig, info, t);
1186  	spin_unlock_irqrestore(&t->sighand->siglock, flags);
1187  
1188  	return ret;
1189  }

接着走到specific_send_sig_info()

1134  static int
1135  specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1136  {
1137  	return send_signal(sig, info, t, 0);
1138  }

specific_send_sig_info() 接着调用到send_signal()

1082  static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1083  			int group)
1084  {
1085  	int from_ancestor_ns = 0;
1086  
1087  #ifdef CONFIG_PID_NS
1088  	from_ancestor_ns = si_fromuser(info) &&
1089  			   !task_pid_nr_ns(current, task_active_pid_ns(t));
1090  #endif
1091  
1092  	return __send_signal(sig, info, t, group, from_ancestor_ns);
1093  }

send_signal() 也没有太多的处理逻辑，继续调用到__send_signal()，传入的group 参数的值是0。

978  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
979  			int group, int from_ancestor_ns)
980  {
...
989  	if (!prepare_signal(sig, t,  //对特殊信号做相应处理
990  			from_ancestor_ns || (info == SEND_SIG_FORCED)))
991  		goto ret;
992  
993  	pending = group ? &t->signal->shared_pending : &t->pending; //这时group 为0，所以信号放入线程t 私有的pending 队列（t->pending），而不是线程组共享的shared_pending
...
1073  out_set:
1074  	signalfd_notify(t, sig);  // Deliver the signal to listening signalfd
1075  	sigaddset(&pending->signal, sig);
1076  	complete_signal(sig, t, group); //继续往下走到这里
1077  ret:
1078  	trace_signal_generate(sig, info, t, group, result);
1079  	return ret;
1080  }

__send_signal()里面做了各种各样的check，因为我们这里主要是熟悉这个流程，对于里面的具体细节，不做过多的介绍，需要了解的同学可以自己看源码。

876  static void complete_signal(int sig, struct task_struct *p, int group)
877  {
878  	struct signal_struct *signal = p->signal;
879  	struct task_struct *t;
880  
881  	/*
882  	 * Now find a thread we can wake up to take the signal off the queue.
883  	 *
884  	 * If the main thread wants the signal, it gets first crack.
885  	 * Probably the least surprising to the average bear.
886  	 */
887  	if (wants_signal(sig, p))
888  		t = p;
889  	else if (!group || thread_group_empty(p))
890  		/*
891  		 * There is just one thread and it does not need to be woken.
892  		 * It will dequeue unblocked signals before it runs again.
893  		 */
894  		return;
895  	else {
896  		/*
897  		 * Otherwise try to find a suitable thread.
898  		 */
899  		t = signal->curr_target;
900  		while (!wants_signal(sig, t)) {
901  			t = next_thread(t);
902  			if (t == signal->curr_target)
903  				/*
904  				 * No thread needs to be woken.
905  				 * Any eligible threads will see
906  				 * the signal in the queue soon.
907  				 */
908  				return;
909  		}
910  		signal->curr_target = t;
911  	}
912  
913  	/*
914  	 * Found a killable thread.  If the signal will be fatal,
915  	 * then start taking the whole group down immediately.
916  	 */
917  	if (sig_fatal(p, sig) &&
918  	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
919  	    !sigismember(&t->real_blocked, sig) &&
920  	    (sig == SIGKILL || !t->ptrace)) {
921  		/*
922  		 * This signal will be fatal to the whole group.
923  		 */
924  		if (!sig_kernel_coredump(sig)) {
925  			/*
926  			 * Start a group exit and wake everybody up.
927  			 * This way we don't have other threads
928  			 * running and doing things after a slower
929  			 * thread has the fatal signal pending.
930  			 */
931  			signal->flags = SIGNAL_GROUP_EXIT;
932  			signal->group_exit_code = sig;
933  			signal->group_stop_count = 0;
934  			t = p;
935  			do {
936  				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
937  				sigaddset(&t->pending.signal, SIGKILL);
938  				signal_wake_up(t, 1);
939  			} while_each_thread(p, t);
940  			return;
941  		}
942  	}
943  
944  	/*
945  	 * The signal is already in the shared-pending queue.
946  	 * Tell the chosen thread to wake up and dequeue it.
947  	 */
948  	signal_wake_up(t, sig == SIGKILL);
949  	return;
950  }

complete_signal() 对信号做了进一步处理，最后调用到signal_wake_up()

linux-4.10/include/uapi/asm-generic/signal.h

10  #define SIGHUP		 1
11  #define SIGINT		 2
12  #define SIGQUIT		 3
13  #define SIGILL		 4 //执行了非法指令. 通常是因为可执行文件本身出现错误, 或者试图执行数据段. 堆栈溢出时也有可能产生这个信号。
14  #define SIGTRAP		 5
15  #define SIGABRT		 6
16  #define SIGIOT		 6  //调用abort函数生成的信号
17  #define SIGBUS		 7
18  #define SIGFPE		 8
19  #define SIGKILL		 9 //用来立即结束程序的运行. 本信号不能被阻塞、处理和忽略。
20  #define SIGUSR1		10
21  #define SIGSEGV		11 //试图访问未分配给自己的内存, 或试图往没有写权限的内存地址写数据.
22  #define SIGUSR2		12
23  #define SIGPIPE		13  //管道破裂。这个信号通常在进程间通信产生
24  #define SIGALRM		14
25  #define SIGTERM		15
26  #define SIGSTKFLT   	16
27  #define SIGCHLD		17
28  #define SIGCONT		18  //让一个停止(stopped)的进程继续执行. 本信号不能被阻塞.
29  #define SIGSTOP		19
30  #define SIGTSTP		20

上面给出了一些信号对应的值，在实际中，遇到最多的情况就是SIGSEGV，也就是访问了非法地址，应该90%以上是这种情况。

linux-4.10/include/linux/sched.h

3520  static inline void signal_wake_up(struct task_struct *t, bool resume)
3521  {
3522  	signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
3523  }

现在考虑的是SIGSEGV 信号的情况，所以上面传下来的resume 为0，调用到signal_wake_up_state() 传入的第二个参数也是0

linux-4.10/kernel/signal.c

645  void signal_wake_up_state(struct task_struct *t, unsigned int state)
646  {
647  	set_tsk_thread_flag(t, TIF_SIGPENDING);
648  	/*
649  	 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
650  	 * case. We don't check t->state here because there is a race with it
651  	 * executing another processor and just now entering stopped state.
652  	 * By using wake_up_state, we ensure the process will wake up and
653  	 * handle its death signal.
654  	 */
655  	if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))  
656  		kick_process(t);
657  }

走了一圈，又回到signal.c 这个文件里面的代码。set_tsk_thread_flag()会在thread_info.flags 中置上TIF_SIGPENDING 标志位，wake_up_state()和kick_process()就不深入了解了。

linux-4.10/arch/arm64/kernel/entry.S

589  el0_da:
590  	/*
591  	 * Data abort handling
592  	 */
593  	mrs	x26, far_el1
594  	// enable interrupts before calling the main handler
595  	enable_dbg_and_irq
596  	ct_user_exit
597  	bic	x0, x26, #(0xff << 56)
598  	mov	x1, x25
599  	mov	x2, sp
//bl 表示带链接（可返回）的跳转：首先将当前指令的下一条指令地址保存在LR寄存器，然后跳转到label
//b 表示无条件跳转
600  	bl	do_mem_abort  //do_mem_abort在kernel 走了一圈，返回了
601  	b	ret_to_user  //

重新回到entry.S 的el0_da 代码块，do_mem_abort跑完了之后，会继续往下，跑到ret_to_user

/*
770   * "slow" syscall return path.
771   */
772  ret_to_user:
773  	disable_irq				// disable interrupts
774  	ldr	x1, [tsk, #TSK_TI_FLAGS]  //tsk	.req	x28 current thread_info  x1 = thread_info.flags
775  	and	x2, x1, #_TIF_WORK_MASK  // x2 = x1 & _TIF_WORK_MASK   也就是取出thread_info.flags 的标志位
776  	cbnz	x2, work_pending         // 如果x2 != 0 跳到 work_pending 
777  finish_ret_to_user:
778  	enable_step_tsk x1, x2
779  	kernel_exit 0
780  ENDPROC(ret_to_user)

前面说了set_tsk_thread_flag()会把thread_info.flags设置成TIF_SIGPENDING(增加这个flag)，所以这里会走到work_pending。

758  /*
759   * Ok, we need to do extra processing, enter the slow path.
760   */
761  work_pending:
762  	mov	x0, sp				// 'regs'
763  	bl	do_notify_resume
764  #ifdef CONFIG_TRACE_IRQFLAGS
765  	bl	trace_hardirqs_on		// enabled while in userspace
766  #endif
767  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
768  	b	finish_ret_to_user

继续往下会调用到do_notify_resume()函数

linux-4.10/arch/arm64/kernel/signal.c

402  asmlinkage void do_notify_resume(struct pt_regs *regs,
403  				 unsigned int thread_flags)
404  {
405  	/*
406  	 * The assembly code enters us with IRQs off, but it hasn't
407  	 * informed the tracing code of that for efficiency reasons.
408  	 * Update the trace code with the current status.
409  	 */
410  	trace_hardirqs_off();
411  	do {
412  		if (thread_flags & _TIF_NEED_RESCHED) {
413  			schedule();
414  		} else {
415  			local_irq_enable();
416  
417  			if (thread_flags & _TIF_UPROBE)
418  				uprobe_notify_resume(regs);
419  
420  			if (thread_flags & _TIF_SIGPENDING)  //走的是这里
421  				do_signal(regs);
422  
423  			if (thread_flags & _TIF_NOTIFY_RESUME) {
424  				clear_thread_flag(TIF_NOTIFY_RESUME);
425  				tracehook_notify_resume(regs);
426  			}
427  
428  			if (thread_flags & _TIF_FOREIGN_FPSTATE)
429  				fpsimd_restore_current_state();
430  		}
431  
432  		local_irq_disable();
433  		thread_flags = READ_ONCE(current_thread_info()->flags);
434  	} while (thread_flags & _TIF_WORK_MASK);
435  }

通过前面的了解，我们知道thread_flags & _TIF_SIGPENDING 这个条件是成立的，所以继续走到do_signal(regs);

static void do_signal(struct pt_regs *regs)
332  {
...

366  	/*
367  	 * Get the signal to deliver. When running under ptrace, at this point
368  	 * the debugger may change all of our registers.
369  	 */
370  	if (get_signal(&ksig)) {
...
385  		handle_signal(&ksig, regs);
386  		return;
387  	}
...
399  	restore_saved_sigmask();
400  }

接着会调用handle_signal() 函数

285  /*
286   * OK, we're invoking a handler
287   */
288  static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
289  {
290  	struct task_struct *tsk = current;
291  	sigset_t *oldset = sigmask_to_save();
292  	int usig = ksig->sig;
293  	int ret;
294  
295  	/*
296  	 * Set up the stack frame
297  	 */
298  	if (is_compat_task()) {
299  		if (ksig->ka.sa.sa_flags & SA_SIGINFO)
300  			ret = compat_setup_rt_frame(usig, ksig, oldset, regs);
301  		else
302  			ret = compat_setup_frame(usig, ksig, oldset, regs);
303  	} else {
304  		ret = setup_rt_frame(usig, ksig, oldset, regs);  //走这里
305  	}
306  
307  	/*
308  	 * Check that the resulting registers are actually sane.
309  	 */
310  	ret |= !valid_user_regs(&regs->user_regs, current);
311  
312  	/*
313  	 * Fast forward the stepping logic so we step into the signal
314  	 * handler.
315  	 */
316  	if (!ret)
317  		user_fastforward_single_step(tsk);
318  
319  	signal_setup_done(ret, ksig, 0);
320  }

我们重点看setup_rt_frame()做了什么。

250  static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
251  			  struct pt_regs *regs)
252  {
253  	struct rt_sigframe __user *frame;
254  	int err = 0;
255  
256  	frame = get_sigframe(ksig, regs);  //获取用户空间处理信号的栈
257  	if (!frame)
258  		return 1;
259  
260  	__put_user_error(0, &frame->uc.uc_flags, err);
261  	__put_user_error(NULL, &frame->uc.uc_link, err);
262  
263  	err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
264  	err |= setup_sigframe(frame, regs, set);
265  	if (err == 0) {
266  		setup_return(regs, &ksig->ka, frame, usig);
267  		if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
268  			err |= copy_siginfo_to_user(&frame->info, &ksig->info);
269  			regs->regs[1] = (unsigned long)&frame->info;
270  			regs->regs[2] = (unsigned long)&frame->uc;
271  		}
272  	}
273  
274  	return err;
275  }

setup_rt_frame()会为用户空间执行信号处理函数准备好栈和相关参数。

232  static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
233  			 void __user *frame, int usig)
234  {
235  	__sigrestore_t sigtramp;
236  
237  	regs->regs[0] = usig;
238  	regs->sp = (unsigned long)frame;
239  	regs->regs[29] = regs->sp + offsetof(struct rt_sigframe, fp);
240  	regs->pc = (unsigned long)ka->sa.sa_handler; //之前注册的用户空间处理函数
241  
242  	if (ka->sa.sa_flags & SA_RESTORER)
243  		sigtramp = ka->sa.sa_restorer;
244  	else
245  		sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp);  
246  
247  	regs->regs[30] = (unsigned long)sigtramp; //用户空间的处理函数执行完后，会调用这个函数再返回内核空间
248  }

regs->pc 设置成之前用户空间传下来的处理函数，所以返回用户空间时会执行处理函数；regs->regs[30]设置成sigtramp，用户空间执行完处理函数后，通过这个函数再返回内核空间。

linux-4.10/arch/arm64/kernel/vdso/vdso.lds.S

/*
96   * Make the sigreturn code visible to the kernel.
97   */
98  VDSO_sigtramp		= __kernel_rt_sigreturn;

linux-4.10/arch/arm64/kernel/vdso/sigreturn.S

28  ENTRY(__kernel_rt_sigreturn)
29  	.cfi_startproc
30  	.cfi_signal_frame
31  	.cfi_def_cfa	x29, 0
32  	.cfi_offset	x29, 0 * 8
33  	.cfi_offset	x30, 1 * 8
34  	mov	x8, #__NR_rt_sigreturn
35  	svc	#0
36  	.cfi_endproc
37  ENDPROC(__kernel_rt_sigreturn)

所以上面的sigtramp 里面插入了一个系统调用__NR_rt_sigreturn，也就是信号处理后返回。

bionic/linker/debugger.cpp

302  __LIBC_HIDDEN__ void debuggerd_init() {
...
306    action.sa_sigaction = debuggerd_signal_handler;
...
321  }

再回到debuggerd_init()中，kernel 将信号发送出来后，经过许多环节的处理，会返回用户空间，调用debuggerd_signal_handler()这个处理函数。

262  static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
...
271    send_debuggerd_packet(info);
...
294    int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info);
295    if (rc != 0) {
296      __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297                        strerror(errno));
298      _exit(0);
299    }
300  }

这里会调用send_debuggerd_packet() 向debuggerd进程发送信息。

208  static void send_debuggerd_packet(siginfo_t* info) {
...
226    int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC); //创建socket连接
227    if (s == -1) {
228      __libc_format_log(ANDROID_LOG_FATAL, "libc", "Unable to open connection to debuggerd: %s",
229                        strerror(errno));
230      return;
231    }
232  
233    // debuggerd knows our pid from the credentials on the
234    // local socket but we need to tell it the tid of the crashing thread.
235    // debuggerd will be paranoid and verify that we sent a tid
236    // that's actually in our process.
237    debugger_msg_t msg;
238    msg.action = DEBUGGER_ACTION_CRASH;
239    msg.tid = gettid();  //消息中有tid，也就是出现异常的线程号
240    msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message);
241    msg.original_si_code = (info != nullptr) ? info->si_code : 0;
242    ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg)));  //将消息通过socket 发送给debuggerd
...
255    close(s);
256  }

接着debuggerd 会收到发送过来的消息

system/core/debuggerd/debuggerd.cpp

921  int main(int argc, char** argv) {
922    union selinux_callback cb;
923    if (argc == 1) {
924      cb.func_audit = audit_callback;
925      selinux_set_callback(SELINUX_CB_AUDIT, cb);
926      cb.func_log = selinux_log_callback;
927      selinux_set_callback(SELINUX_CB_LOG, cb);
928      return do_server();
929    }
930  
931    bool dump_backtrace = false;
932    bool have_tid = false;
933    pid_t tid = 0;
934    for (int i = 1; i < argc; i++) {
935      if (!strcmp(argv[i], "-b")) {
936        dump_backtrace = true;
937      } else if (!have_tid) {
938        tid = atoi(argv[i]);
939        have_tid = true;
940      } else {
941        usage();
942        return 1;
943      }
944    }
945    if (!have_tid) {
946      usage();
947      return 1;
948    }
949    return do_explicit_dump(tid, dump_backtrace);
950  }

debuggerd进程运行起来之后，会根据参数走不同的流程，如果是默认的（不带参数）走do_server()，也就是创建一个socket server，等待client端连接，处理相应的请求。另一种情况就是我们手动调用它dump 某个进程的backtrace，前面有讲过，adb shell debuggerd -b <tid>（从上面的代码可以看到，参数是按tid 解析的）。

842  static int do_server() {
843    // debuggerd crashes can't be reported to debuggerd.
844    // Reset all of the crash handlers.
845    signal(SIGABRT, SIG_DFL);
846    signal(SIGBUS, SIG_DFL);
847    signal(SIGFPE, SIG_DFL);
848    signal(SIGILL, SIG_DFL);
849    signal(SIGSEGV, SIG_DFL);
850  #ifdef SIGSTKFLT
851    signal(SIGSTKFLT, SIG_DFL);
852  #endif
853    signal(SIGTRAP, SIG_DFL);
854  
855    // Ignore failed writes to closed sockets
856    signal(SIGPIPE, SIG_IGN);
857  
858    // Block SIGCHLD so we can sigtimedwait for it.
859    sigset_t sigchld;
860    sigemptyset(&sigchld);
861    sigaddset(&sigchld, SIGCHLD);
862    sigprocmask(SIG_SETMASK, &sigchld, nullptr);
863  
864    int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT, //创建socket ，充当server端
865                                SOCK_STREAM | SOCK_CLOEXEC);
866    if (s == -1) return 1;
867  
868    // Fork a process that stays root, and listens on a pipe to pause and resume the target.
869    if (!start_signal_sender()) {
870      ALOGE("debuggerd: failed to fork signal sender");
871      return 1;
872    }
873  
874    ALOGI("debuggerd: starting\n");
875  
876    for (;;) {  //循环等待
877      sockaddr_storage ss;
878      sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
879      socklen_t alen = sizeof(ss);
880  
881      ALOGV("waiting for connection\n");
882      int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC); //等待client端来连接
883      if (fd == -1) {
884        ALOGE("accept failed: %s\n", strerror(errno));
885        continue;
886      }
887  
888      handle_request(fd); //处理client端的请求
889    }
890    return 0;
891  }

do_server() 会先把debuggerd 自身的crash 信号处理函数恢复成默认处理（SIG_DFL），因为debuggerd 自己的crash 无法再上报给debuggerd；然后创建socket，充当server，调用accept4()等待client端来连接，收到连接后，调用handle_request(fd)处理。

801  static void handle_request(int fd) {
802    ALOGV("handle_request(%d)\n", fd);
803  
804    ScopedFd closer(fd);
805    debugger_request_t request;
806    memset(&request, 0, sizeof(request));
807    int status = read_request(fd, &request); //读取client端发送过来的消息
808    if (status != 0) {
809      return;
810    }
...
831    // Fork a child to handle the rest of the request.
832    pid_t fork_pid = fork();
833    if (fork_pid == -1) {
834      ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
835    } else if (fork_pid == 0) {
836      worker_process(fd, request);  //fork 出子进程来处理
837    } else {
838      monitor_worker_process(fork_pid, request);
839    }
840  }

handle_request()会把client 端，也就是信号处理函数中发送过来的信息读出来，然后创建出一个子进程继续处理。

565  static void worker_process(int fd, debugger_request_t& request) {
...
598    // Attach to the target process.
599    if (!ptrace_attach_thread(request.pid, request.tid)) {  //ptrace 进程
600      ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
601      exit(1);
602    }
...
608    if (request.action == DEBUGGER_ACTION_CRASH) { //通过前面的代码我们知道action是 DEBUGGER_ACTION_CRASH
609      pid_t pid;
610      uid_t uid;
611      gid_t gid;
612      if (get_process_info(request.tid, &pid, &uid, &gid) != 0) {
613        ALOGE("debuggerd: failed to get process info for tid '%d'", request.tid);
614        exit(1);
615      }
...
624    }
625  
626    // Don't attach to the sibling threads if we want to attach gdb.
627    // Supposedly, it makes the process less reliable.
628    bool attach_gdb = should_attach_gdb(request);
629    if (attach_gdb) {
630      // Open all of the input devices we need to listen for VOLUMEDOWN before dropping privileges.
631      if (init_getevent() != 0) {
632        ALOGE("debuggerd: failed to initialize input device, not waiting for gdb");
633        attach_gdb = false;
634      }
636    }
...
662    int crash_signal = SIGKILL;
663    succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings, //dump 异常进程的 寄存器 和backtrace 信息
664                             &crash_signal, amfd_data.get());
...
692    for (pid_t sibling : siblings) {
693      ptrace(PTRACE_DETACH, sibling, 0, 0); //ptrace DETACH
694    }
...
717  
718    close(amfd);  //关闭socket连接
719  
720    exit(!succeeded);
721  }

worker_process()中，会PTRACE_ATTACH 上发生异常的进程，然后dump 出进程的信息用于debug，最后PTRACE_DETACH 该进程，关于ptrace 的功能，这里不再介绍，用户空间调用ptrace 实际上是系统调用的接口，真正的实现在kernel中。

483  static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
484                           BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
485                           int* crash_signal, std::string* amfd_data) {
...
492    while (true) {
493      int signal = wait_for_signal(request.tid, &total_sleep_time_usec);
494      switch (signal) {
495        case -1:
496          ALOGE("debuggerd: timed out waiting for signal");
497          return false;
498  
...
517        case SIGABRT:
518        case SIGBUS:
519        case SIGFPE:
520        case SIGILL:
521        case SIGSEGV:
522  #ifdef SIGSTKFLT
523        case SIGSTKFLT:
524  #endif
525        case SIGSYS:
526        case SIGTRAP:
527          ALOGV("stopped -- fatal signal\n");
528          *crash_signal = signal;
529          engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
530                            request.original_si_code, request.abort_msg_address, amfd_data);
531          break;
532  
533        default:
534          ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
535          break;
536      }
537      break;
538    }
539  
540    return true;
541  }

ptrace 上出现异常的进程后，该进程会重新跑起来，还是跑出现异常的那段代码，所以又会发生异常，但是这时候的异常信号不会发送给debuggerd_signal_handler()处理函数，而是给当前的debuggerd进程。debuggerd进程收到信号后会调用engrave_tombstone()。

system/core/debuggerd/tombstone.cpp

688  void engrave_tombstone(int tombstone_fd, BacktraceMap* map, pid_t pid, pid_t tid,
689                         const std::set<pid_t>& siblings, int signal, int original_si_code,
690                         uintptr_t abort_msg_address, std::string* amfd_data) {
691    log_t log;
692    log.current_tid = tid;
693    log.crashed_tid = tid;
694  
695    if (tombstone_fd < 0) {
696      ALOGE("debuggerd: skipping tombstone write, nothing to do.\n");
697      return;
698    }
699  
700    log.tfd = tombstone_fd;
701    log.amfd_data = amfd_data;
702    dump_crash(&log, map, pid, tid, siblings, signal, original_si_code, abort_msg_address);
703  }

走到dump_crash() dump相关信息

607  // Dumps all information about the specified pid to the tombstone.
608  static void dump_crash(log_t* log, BacktraceMap* map, pid_t pid, pid_t tid,
609                         const std::set<pid_t>& siblings, int signal, int si_code,
610                         uintptr_t abort_msg_address) {
611    // don't copy log messages to tombstone unless this is a dev device
612    char value[PROPERTY_VALUE_MAX];
613    property_get("ro.debuggable", value, "0");
614    bool want_logs = (value[0] == '1');
615  
616    _LOG(log, logtype::HEADER,
617         "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");
618    dump_header_info(log);
619    dump_thread(log, pid, tid, map, signal, si_code, abort_msg_address, true);
620    if (want_logs) {
621      dump_logs(log, pid, 5);
622    }
623  
624    if (!siblings.empty()) {
625      for (pid_t sibling : siblings) {
626        dump_thread(log, pid, sibling, map, 0, 0, 0, false);
627      }
628    }
629  
630    if (want_logs) {
631      dump_logs(log, pid, 0);
632    }
633  }