system_server crash现象研究
现象:当system server进程crash时,发现zygote进程会被杀掉,此后Zygote进程和system server都会被重新启动。
分析:在init解析init.rc时,Zygote进程作为一个服务被定义,且被声明为自动重启。因此一旦Zygote进程退出,init就会收到子进程退出信号(SIGCHLD),从而重新启动zygote服务,进而由Zygote启动System Server。同样,System Server被Zygote作为子进程启动后,Zygote通过信号监听该子进程的状态;一旦System Server退出,Zygote就会杀死自身,等待init将其重新启动。另外,system server进程会监听service manager进程,如service manager退出,system server会杀掉自身,从而导致zygote被重启。
下面为相关代码:
Zygote启动system server入口:
libcore/dalvik/src/main/java/dalvik/system/Zygote.java
- /**
- * Special method to start the system server process.
- * Maps {@code enableDebugger} to the {@code DEBUG_ENABLE_DEBUGGER} flag
- * (or 0 when false) and returns the result of
- * {@code forkAndSpecialize(uid, gid, gids, debugFlags, rlimits)} unchanged.
- * @deprecated use {@link Zygote#forkSystemServer(int, int, int[], int, int[][])}
- */
- @Deprecated
- public static int forkSystemServer(int uid, int gid, int[] gids,
- boolean enableDebugger, int[][] rlimits) {
- int debugFlags = enableDebugger ? DEBUG_ENABLE_DEBUGGER : 0;
- return forkAndSpecialize(uid, gid, gids, debugFlags, rlimits);
- }
forkAndSpecialize是一个JNI函数,其native实现位于dalvik_system_Zygote.c(下面列出的是该文件中的Dalvik_dalvik_system_Zygote_fork())。该函数在fork之前注册SIGCHLD信号处理函数;此后有子进程退出时,处理函数会检查其pid,仅当退出的子进程是system server时才杀掉本进程(zygote进程)。
dalvik_system_Zygote.c
- /* native public static int fork();
- *
- * Throws IllegalStateException when gDvm.zygote is not set (VM not started
- * with -Xzygote), aborts the VM if dvmGcPreZygoteFork() fails, installs the
- * SIGCHLD handler, then calls fork() and hands the fork() result back to the
- * caller through RETURN_INT.
- */
- static void Dalvik_dalvik_system_Zygote_fork(const u4* args, JValue* pResult)
- {
- pid_t pid;
- if (!gDvm.zygote) {
- dvmThrowException("Ljava/lang/IllegalStateException;",
- "VM instance not started with -Xzygote");
- RETURN_VOID();
- }
- if (!dvmGcPreZygoteFork()) {
- LOGE("pre-fork heap failed\n");
- dvmAbort();
- }
- setSignalHandler(); // register the SIGCHLD handler here so child-process state can be monitored
- dvmDumpLoaderStats("zygote");
- pid = fork();
- #ifdef HAVE_ANDROID_OS
- if (pid == 0) {
- /* child process */
- extern int gMallocLeakZygoteChild;
- gMallocLeakZygoteChild = 1;
- }
- #endif
- RETURN_INT(pid);
- }
- /*
- * configure sigchld handler for the zygote process
- * This is configured very late, because earlier in the dalvik lifecycle
- * we can fork() and exec() for the verifier/optimizer, and we
- * want to waitpid() for those rather than have them be harvested immediately.
- *
- * This ends up being called repeatedly before each fork(), but there's
- * no real harm in that.
- *
- * A sigaction() failure is only logged as a warning; nothing is returned.
- */
- static void setSignalHandler()
- {
- int err;
- struct sigaction sa;
- memset(&sa, 0, sizeof(sa));
- sa.sa_handler = sigchldHandler; // address of the signal handler function
- err = sigaction (SIGCHLD, &sa, NULL); // install the handler that runs when SIGCHLD arrives (a child terminated)
- if (err < 0) {
- LOGW("Error setting SIGCHLD handler: %s", strerror(errno));
- }
- }
- /*
- * This signal handler is for zygote mode, since the zygote
- * must reap its children
- *
- * Reaps every available child with waitpid(-1, ..., WNOHANG), logs how each
- * one ended, and sends SIGKILL to the zygote itself when the reaped pid
- * equals gDvm.systemServerPid. A negative waitpid() result after the loop
- * is logged as a warning.
- */
- static void sigchldHandler(int s)
- {
- pid_t pid;
- int status;
- while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { // pid of a terminated child, obtained without blocking
- /* Log process-death status that we care about. In general it is not
- safe to call LOG(...) from a signal handler because of possible
- reentrancy. However, we know a priori that the current implementation
- of LOG() is safe to call from a SIGCHLD handler in the zygote process.
- If the LOG() implementation changes its locking strategy or its use
- of syscalls within the lazy-init critical section, its use here may
- become unsafe. */
- if (WIFEXITED(status)) {
- if (WEXITSTATUS(status)) {
- LOG(LOG_DEBUG, ZYGOTE_LOG_TAG, "Process %d exited cleanly (%d)\n",
- (int) pid, WEXITSTATUS(status));
- } else {
- IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) {
- LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG,
- "Process %d exited cleanly (%d)\n",
- (int) pid, WEXITSTATUS(status));
- }
- }
- } else if (WIFSIGNALED(status)) {
- if (WTERMSIG(status) != SIGKILL) {
- LOG(LOG_DEBUG, ZYGOTE_LOG_TAG,
- "Process %d terminated by signal (%d)\n",
- (int) pid, WTERMSIG(status));
- } else {
- IF_LOGV(/*should use ZYGOTE_LOG_TAG*/) {
- LOG(LOG_VERBOSE, ZYGOTE_LOG_TAG,
- "Process %d terminated by signal (%d)\n",
- (int) pid, WTERMSIG(status));
- }
- }
- }
- /*
- * If the just-crashed process is the system_server, bring down zygote
- * so that it is restarted by init and system server will be restarted
- * from there.
- */
- if (pid == gDvm.systemServerPid) { // only when the terminated child is system server does zygote kill itself
- LOG(LOG_INFO, ZYGOTE_LOG_TAG,
- "Exit zygote because system server (%d) has terminated\n",
- (int) pid);
- kill(getpid(), SIGKILL); // kill the zygote process itself; init then restarts zygote, which in turn restarts system server
- }
- }
- if (pid < 0) {
- LOG(LOG_WARN, ZYGOTE_LOG_TAG,
- "Zygote SIGCHLD error in waitpid: %s\n",strerror(errno));
- }
- }
在Zygote被杀掉后,即init.rc中下面的service被杀掉:
- service zygote /system/bin/app_process -Xzygote /system/bin --zygote --start-system-server //启动SystemServer
- class zygote_services
- socket zygote stream 666
- onrestart write /sys/android_power/request_state wake
- onrestart write /sys/power/state on
- onrestart restart media
- onrestart restart netd
init进程启动后将进入无限循环以监听init.rc中启动的service状态,如发现有service退出则会重新启动该service。以下为init进程监听子进程的代码:
system/core/init/init.c
- int main(int argc, char **argv) // init entry point: sets up /dev, /proc and /sys, parses the rc files, queues boot actions, then loops forever servicing events
- {
- int fd_count = 0;
- struct pollfd ufds[4];
- char *tmpdev;
- char* debuggable;
- char tmp[32];
- int property_set_fd_init = 0;
- int signal_fd_init = 0;
- int keychord_fd_init = 0;
- struct rlimit rlim;
- struct rlimit rlim_new;
- if (!strcmp(basename(argv[0]), "ueventd"))
- return ueventd_main(argc, argv);
- /* clear the umask */
- umask(0);
- /* Get the basic filesystem setup we need put
- * together in the initramdisk on / and then we'll
- * let the rc file figure out the rest.
- */
- mkdir("/dev", 0755);
- mkdir("/proc", 0755);
- mkdir("/sys", 0755);
- mount("tmpfs", "/dev", "tmpfs", 0, "mode=0755");
- mkdir("/dev/pts", 0755);
- mkdir("/dev/socket", 0755);
- mount("devpts", "/dev/pts", "devpts", 0, NULL);
- mount("proc", "/proc", "proc", 0, NULL);
- mount("sysfs", "/sys", "sysfs", 0, NULL);
- /* We must have some place other than / to create the
- * device nodes for kmsg and null, otherwise we won't
- * be able to remount / read-only later on.
- * Now that tmpfs is mounted on /dev, we can actually
- * talk to the outside world.
- */
- open_devnull_stdio();
- log_init();
- init_parse_config_file("/init.rc"); // parse the file /init.rc
- /* pull the kernel commandline and ramdisk properties file in */
- import_kernel_cmdline(0);
- get_hardware_name(hardware, &revision);
- snprintf(tmp, sizeof(tmp), "/init.%s.rc", hardware); // build the path /init.<hardware>.rc (e.g. init.goldfish.rc), parsed on the next line; presumably holds hardware-specific settings
- init_parse_config_file(tmp);
- action_for_each_trigger("early-init", action_add_queue_tail); // for each action whose trigger is "early-init", call action_add_queue_tail (presumably appends it to the tail of the pending-action queue)
- queue_builtin_action(wait_for_coldboot_done_action, "wait_for_coldboot_done");
- queue_builtin_action(property_init_action, "property_init");
- queue_builtin_action(keychord_init_action, "keychord_init");
- queue_builtin_action(console_init_action, "console_init");
- queue_builtin_action(set_init_properties_action, "set_init_properties");
- if (getrlimit(RLIMIT_CORE, &rlim)==0) {
- rlim_new.rlim_cur = rlim_new.rlim_max = RLIM_INFINITY;
- if (setrlimit(RLIMIT_CORE, &rlim_new)!=0) {
- /* failed. try raising just to the old max */
- rlim_new.rlim_cur = rlim_new.rlim_max = rlim.rlim_max;
- (void) setrlimit(RLIMIT_CORE, &rlim_new);
- }
- }
- /* execute all the boot actions to get us started */
- action_for_each_trigger("init", action_add_queue_tail);
- action_for_each_trigger("early-fs", action_add_queue_tail);
- action_for_each_trigger("fs", action_add_queue_tail);
- action_for_each_trigger("post-fs", action_add_queue_tail);
- queue_builtin_action(property_service_init_action, "property_service_init");
- queue_builtin_action(signal_init_action, "signal_init");
- queue_builtin_action(check_startup_action, "check_startup");
- /* execute all the boot actions to get us started */
- action_for_each_trigger("early-boot", action_add_queue_tail);
- action_for_each_trigger("boot", action_add_queue_tail);
- queue_all_device_triggers();
- execute_one_command();
- device_triggers_enabled = 1;
- /* run all property triggers based on current state of the properties */
- queue_builtin_action(queue_property_triggers_action, "queue_propety_triggers");
- #if BOOTCHART
- queue_builtin_action(bootchart_init_action, "bootchart_init");
- #endif
- for(;;) { // loop forever
- int nr, i, timeout = -1;
- execute_one_command();
- restart_processes(); // restart any service that is flagged as needing a restart
- if (!property_set_fd_init && get_property_set_fd() > 0) {
- ufds[fd_count].fd = get_property_set_fd();
- ufds[fd_count].events = POLLIN;
- ufds[fd_count].revents = 0;
- fd_count++;
- property_set_fd_init = 1;
- }
- if (!signal_fd_init && get_signal_fd() > 0) {
- ufds[fd_count].fd = get_signal_fd();
- ufds[fd_count].events = POLLIN;
- ufds[fd_count].revents = 0;
- fd_count++;
- signal_fd_init = 1;
- }
- if (!keychord_fd_init && get_keychord_fd() > 0) {
- ufds[fd_count].fd = get_keychord_fd();
- ufds[fd_count].events = POLLIN;
- ufds[fd_count].revents = 0;
- fd_count++;
- keychord_fd_init = 1;
- }
- if (process_needs_restart) {
- timeout = (process_needs_restart - gettime()) * 1000;
- if (timeout < 0)
- timeout = 0;
- }
- if (!action_queue_empty() || cur_action)
- timeout = 0;
- #if BOOTCHART
- if (bootchart_count > 0) {
- if (timeout < 0 || timeout > BOOTCHART_POLLING_MS)
- timeout = BOOTCHART_POLLING_MS;
- if (bootchart_step() < 0 || --bootchart_count == 0) {
- bootchart_finish();
- bootchart_count = 0;
- }
- }
- #endif
- nr = poll(ufds, fd_count, timeout);
- if (nr <= 0)
- continue;
- for (i = 0; i < fd_count; i++) {
- if (ufds[i].revents == POLLIN) {
- if (ufds[i].fd == get_property_set_fd())
- handle_property_set_fd();
- else if (ufds[i].fd == get_keychord_fd())
- handle_keychord();
- else if (ufds[i].fd == get_signal_fd())
- handle_signal(); // reap terminated child processes
- }
- }
- }
- return 0;
- }
子进程退出处理函数
system/core/init/signal_handler.c
- void handle_signal(void) // called from init's main loop when the signal fd becomes readable
- {
- char tmp[32];
- /* we got a SIGCHLD - reap and restart as needed */
- read(signal_recv_fd, tmp, sizeof(tmp)); // NOTE(review): the annotator asks why up to 32 bytes are read; presumably this only drains pending wake-up data from signal_recv_fd - confirm
- while (!wait_for_one_process(0)) // keep reaping until wait_for_one_process() returns non-zero (no more exited children)
- ;
- }
下面的函数得到已终止子进程的pid并查到对应的Service,然后将该服务的SVC_RESTARTING标志置位,并执行该服务重启前需执行的命令(onrestart)。init主函数的循环将根据该标志位重新启动服务。
- static int wait_for_one_process(int block) // block is 0 when called from handle_signal(); returns -1 when no child was reaped, 0 otherwise
- {
- pid_t pid;
- int status;
- struct service *svc;
- struct socketinfo *si;
- time_t now;
- struct listnode *node;
- struct command *cmd;
- while ( (pid = waitpid(-1, &status, block ? 0 : WNOHANG)) == -1 && errno == EINTR ); // reap one terminated child, retrying while interrupted (EINTR)
- if (pid <= 0) return -1; // no child reaped (or error): nothing to do
- svc = service_find_by_pid(pid); // look up the service that owns this pid
- if (!svc) {
- ERROR("untracked pid %d exited\n", pid);
- return 0;
- }
- if (!(svc->flags & SVC_ONESHOT)) { // service is NOT oneshot: SIGKILL its whole process group
- kill(-pid, SIGKILL);
- NOTICE("process '%s' killing any children in process group\n", svc->name);
- }
- /* remove any sockets we may have created */
- for (si = svc->sockets; si; si = si->next) { // unlink the socket file of every socket recorded for this service
- char tmp[128];
- snprintf(tmp, sizeof(tmp), ANDROID_SOCKET_DIR"/%s", si->name);
- unlink(tmp);
- }
- svc->pid = 0;
- svc->flags &= (~SVC_RUNNING);
- /* oneshot processes go into the disabled state on exit */
- if (svc->flags & SVC_ONESHOT) {
- svc->flags |= SVC_DISABLED;
- }
- /* disabled processes do not get restarted automatically */
- if (svc->flags & SVC_DISABLED) {
- notify_service_state(svc->name, "stopped");
- return 0;
- }
- now = gettime();
- if (svc->flags & SVC_CRITICAL) { // crash accounting for critical services
- if (svc->time_crashed + CRITICAL_CRASH_WINDOW >= now) {
- if (++svc->nr_crashed > CRITICAL_CRASH_THRESHOLD) {
- ERROR("critical process '%s' exited %d times in %d minutes; "
- "rebooting into recovery mode\n", svc->name,
- CRITICAL_CRASH_THRESHOLD, CRITICAL_CRASH_WINDOW / 60);
- sync();
- __reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
- LINUX_REBOOT_CMD_RESTART2, "recovery");
- return 0;
- }
- } else {
- svc->time_crashed = now;
- svc->nr_crashed = 1; // outside the crash window: start a new window and count this crash as the first one in it
- }
- }
- svc->flags |= SVC_RESTARTING; // set the flag so the next restart_processes() pass starts this service again
- /* Execute all onrestart commands for this service. */
- list_for_each(node, &svc->onrestart.commands) { // run the commands that must execute before the service is restarted
- cmd = node_to_item(node, struct command, clist);
- cmd->func(cmd->nargs, cmd->args);
- }
- notify_service_state(svc->name, "restarting"); // report state "restarting" for this service (presumably updates its state property)
- return 0;
- }
system/core/init/init.c
- static void restart_processes() // called on every iteration of init's main loop
- {
- process_needs_restart = 0; // reset before the scan; NOTE(review): presumably set again when a restart has to be deferred - that code is elided in this excerpt
- service_for_each_flags(SVC_RESTARTING, // visit every service that has SVC_RESTARTING set
- restart_service_if_needed);
- }
- void service_for_each_flags(unsigned matchflags, // bit mask tested against each service's flags
- void (*func)(struct service *svc)) // callback invoked for every matching service
- {
- struct listnode *node;
- struct service *svc;
- list_for_each(node, &service_list) {
- svc = node_to_item(node, struct service, slist);
- if (svc->flags & matchflags) { // at least one bit of matchflags is set on this service (SVC_RESTARTING when called from restart_processes)
- func(svc); // invoke the callback (restart_service_if_needed when called from restart_processes)
- }
- }
- }
启动service svc
- static void restart_service_if_needed(struct service *svc) // NOTE: this excerpt is abridged; the trailing "..." below marks code omitted from the quotation
- {
- time_t next_start_time = svc->time_started + 5; // earliest allowed restart: 5 seconds after the service was last started
- if (next_start_time <= gettime()) { // at least 5 seconds have passed since the last start
- svc->flags &= (~SVC_RESTARTING);
- service_start(svc, NULL); // start the service again
- return;
- }...
- }
另外,system server进程会监听service manager进程状态。一旦service manager进程退出,system server进程会自动退出:
system_init.cpp
- class GrimReaper : public IBinder::DeathRecipient { // death recipient whose binderDied() kills the current process
- public:
- GrimReaper() { }
- virtual void binderDied(const wp<IBinder>& who)
- {
- kill(getpid(), SIGKILL); // send SIGKILL to our own process
- }
- };
- } // namespace android
- extern "C" status_t system_init()
- {...
- sp<IServiceManager> sm = defaultServiceManager();
- sp<GrimReaper> grim = new GrimReaper();
- sm->asBinder()->linkToDeath(grim, grim.get(), 0); //监听ServiceManager binder对象