1、do_fork
/*
 * do_fork - create a new task without an explicit TLS value.
 *
 * Thin wrapper around _do_fork(): every argument is forwarded unchanged
 * and 0 is supplied for the trailing tls argument.
 *
 * Returns whatever _do_fork() returns.
 */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr)
{
	long ret;

	ret = _do_fork(clone_flags, stack_start, stack_size,
		       parent_tidptr, child_tidptr, 0 /* tls */);

	return ret;
}
2、参数以及clone_flags
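参数 clone_flags 是创建进程时使用的标志位；stack_start 和 stack_size 用于设置新进程的栈；parent_tidptr 和 child_tidptr 是用户态指针，分别指向父进程和子进程中用于存放新进程 pid（tid）的变量（是否写入由 CLONE_PARENT_SETTID、CLONE_CHILD_SETTID、CLONE_CHILD_CLEARTID 标志决定）。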
其中clone_flags存在如下的设置:
/* Flag bits that may be set in the clone_flags argument. */
#define CLONE_VM		0x00000100	/* share the memory descriptor and all page tables */
#define CLONE_FS		0x00000200	/* share the root and current-working-directory table */
#define CLONE_FILES		0x00000400	/* share the open file descriptor table */
#define CLONE_SIGHAND		0x00000800	/* share signal handlers and blocked-signal data */
#define CLONE_PTRACE		0x00002000	/* if the parent is traced, trace the child as well */
#define CLONE_VFORK		0x00004000	/* used by vfork: parent waits until the child releases the mm */
#define CLONE_PARENT		0x00008000	/* child gets the caller's parent, i.e. becomes the caller's sibling */
#define CLONE_THREAD		0x00010000	/* put the child in the caller's thread group (shares tgid and group_leader) */
#define CLONE_NEWNS		0x00020000	/* new mount namespace */
#define CLONE_SYSVSEM		0x00040000	/* share System V semaphore undo (SEM_UNDO) state */
#define CLONE_SETTLS		0x00080000	/* set up a new TLS for the child */
#define CLONE_PARENT_SETTID	0x00100000	/* store the child's TID at parent_tidptr in the parent's memory */
#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID at child_tidptr in the child's memory when the child exits or execs, and wake waiters */
#define CLONE_DETACHED		0x00400000	/* unused, ignored */
#define CLONE_UNTRACED		0x00800000	/* the tracer cannot force CLONE_PTRACE on this child */
#define CLONE_CHILD_SETTID	0x01000000	/* store the child's TID at child_tidptr in the child's memory */
#define CLONE_NEWUTS		0x04000000	/* New utsname namespace */
#define CLONE_NEWIPC		0x08000000	/* New ipc namespace */
#define CLONE_NEWUSER		0x10000000	/* New user namespace */
#define CLONE_NEWPID		0x20000000	/* New pid namespace */
#define CLONE_NEWNET		0x40000000	/* New network namespace */
#define CLONE_IO		0x80000000	/* Clone io context */
3、_do_fork 函数
/*
 * _do_fork - create a new task via copy_process() and start it running.
 *
 * @clone_flags:   CLONE_* flags (plus the exit signal in the CSIGNAL bits)
 * @stack_start:   forwarded to copy_process()
 * @stack_size:    forwarded to copy_process()
 * @parent_tidptr: user pointer that receives the new pid when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  forwarded to copy_process()
 * @tls:           forwarded to copy_process()
 *
 * Returns the new task's pid as seen through pid_vnr(), or the negative
 * error code carried by copy_process()'s ERR_PTR on failure.
 */
long _do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr,
	      unsigned long tls)
{
	const unsigned long want_vfork = clone_flags & CLONE_VFORK;
	struct completion vfork_completion;
	struct task_struct *child;
	struct pid *child_pid;
	int event = 0;
	long child_nr;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (want_vfork)
			event = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) == SIGCHLD)
			event = PTRACE_EVENT_FORK;
		else
			event = PTRACE_EVENT_CLONE;

		if (likely(!ptrace_event_enabled(current, event)))
			event = 0;
	}

	/* copy_process() does the actual work of building the new task. */
	child = copy_process(clone_flags, stack_start, stack_size,
			     child_tidptr, NULL, event, tls);
	if (IS_ERR(child))
		return PTR_ERR(child);

	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	trace_sched_process_fork(current, child);

	/* Take a reference on the child's struct pid and derive its number. */
	child_pid = get_task_pid(child, PIDTYPE_PID);
	child_nr = pid_vnr(child_pid);

	/* Publish the new pid to the user variable parent_tidptr points at. */
	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(child_nr, parent_tidptr);

	if (want_vfork) {
		/* Arm the completion and pin the task before it can run. */
		child->vfork_done = &vfork_completion;
		init_completion(&vfork_completion);
		get_task_struct(child);
	}

	wake_up_new_task(child);

	/* forking complete and child started to run, tell ptracer */
	if (unlikely(event))
		ptrace_event_pid(event, child_pid);

	if (want_vfork && !wait_for_vfork_done(child, &vfork_completion))
		ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, child_pid);

	put_pid(child_pid);

	return child_nr;
}
4、copy_process函数
/*
 * copy_process - build a new task as a copy of current, without waking it.
 *
 * @clone_flags:  CLONE_* flags selecting what the child shares with current
 * @stack_start:  forwarded to copy_thread_tls()
 * @stack_size:   forwarded to copy_thread_tls()
 * @child_tidptr: user pointer recorded in set_child_tid / clear_child_tid
 *                when CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID are set
 * @pid:          struct pid to use; a fresh one is allocated unless this is
 *                &init_struct_pid
 * @trace:        ptrace event chosen by the caller (0 for none); passed on
 *                to ptrace_init_task() together with CLONE_PTRACE
 * @tls:          forwarded to copy_thread_tls()
 *
 * Returns the new task_struct on success or ERR_PTR(-errno) on failure.
 * On failure, the labels at the bottom undo the completed setup steps in
 * reverse order, so the order of those labels must mirror the setup order.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace,
					unsigned long tls)
{
	int retval;
	struct task_struct *p;
	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};

	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group with the forking task.
	 */
	if (clone_flags & CLONE_THREAD) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) !=
				current->nsproxy->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}

	/* Everything above rejects conflicting flag combinations. */

	/* Security hook for task creation. */
	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	/* Duplicate current's task_struct; a NULL result means -ENOMEM. */
	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	ftrace_graph_init_task(p);

	rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	/*
	 * Enforce RLIMIT_NPROC for the owning user unless that user is
	 * INIT_USER or the caller has CAP_SYS_RESOURCE / CAP_SYS_ADMIN.
	 */
	retval = -EAGAIN;
	if (atomic_read(&p->real_cred->user->processes) >=
			task_rlimit(p, RLIMIT_NPROC)) {
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/* System-wide limit on the number of threads. */
	retval = -EAGAIN;
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
	p->flags |= PF_FORKNOEXEC;	/* forked but has not exec'd yet */
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	/* The child starts with all CPU time accounting zeroed. */
	p->utime = p->stime = p->gtime = 0;
	p->utimescaled = p->stimescaled = 0;
	prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqlock_init(&p->vtime_seqlock);
	p->vtime_snap = 0;
	p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cpu_timers_init(p);

	/* Record the creation time of the new task. */
	p->start_time = ktime_get_ns();
	p->real_start_time = ktime_get_boot_ns();
	p->io_context = NULL;
	p->audit_context = NULL;
	/* Paired with threadgroup_change_end() on both exit paths below. */
	threadgroup_change_begin(current);
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_threadgroup_lock;
	}
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
	seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
	p->hardirqs_enabled = 0;
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif

	p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	/* perf event state for the new task */
	retval = perf_event_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	/* audit context for the new task */
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/*
	 * Copy or share each per-process resource according to clone_flags.
	 * Each step has a matching cleanup label, used when a later step
	 * fails.
	 */
	shm_init_task(p);
	/* System V semaphore undo state */
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_audit;
	/* open file table: shared with CLONE_FILES, otherwise duplicated */
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	/* fs_struct: shared with CLONE_FS, otherwise duplicated */
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	/* signal handlers: duplicated unless CLONE_SIGHAND is set */
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	/* signal_struct, including the parent's resource limits */
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	/* address space: shared with CLONE_VM, otherwise a new mm_struct */
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	/* namespaces: shared or newly created depending on the CLONE_NEW* flags */
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	/* io context */
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	/* architecture-specific thread state and TLS */
	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
	if (retval)
		goto bad_fork_cleanup_io;

	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_io;
		}
	}

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
	p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;
#endif
	/* CLONE_VM without CLONE_VFORK: reset the alternate signal stack. */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
	clear_all_latency_tracing(p);

	/* ok, now we should be set up.. */
	p->pid = pid_nr(pid);
	if (clone_flags & CLONE_THREAD) {
		/* New thread in the caller's group: inherit leader and tgid. */
		p->exit_signal = -1;
		p->group_leader = current->group_leader;
		p->tgid = current->tgid;
	} else {
		if (clone_flags & CLONE_PARENT)
			p->exit_signal = current->group_leader->exit_signal;
		else
			p->exit_signal = (clone_flags & CSIGNAL);
		/* New process: it leads its own thread group. */
		p->group_leader = p;
		p->tgid = p->pid;
	}

	p->nr_dirtied = 0;
	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
	p->dirty_paused_when = 0;

	p->pdeath_signal = 0;
	INIT_LIST_HEAD(&p->thread_group);
	p->task_works = NULL;

	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted the the new process's css_set can be changed
	 * between here and cgroup_post_fork() if an organisation operation is in
	 * progress.
	 */
	retval = cgroup_can_fork(p, cgrp_ss_priv);
	if (retval)
		goto bad_fork_free_pid;

	/*
	 * Make it visible to the rest of the system, but dont wake it up yet.
	 * Need tasklist lock for parent etc handling!
	 */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
	}

	spin_lock(&current->sighand->siglock);

	/*
	 * Copy seccomp details explicitly here, in case they were changed
	 * before holding sighand lock.
	 */
	copy_seccomp(p);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	*/
	recalc_sigpending();
	if (signal_pending(current)) {
		/* Drop both locks in reverse order of acquisition and bail out. */
		spin_unlock(&current->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_cancel_cgroup;
	}

	if (likely(p->pid)) {
		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

		init_task_pid(p, PIDTYPE_PID, pid);
		if (thread_group_leader(p)) {
			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
			init_task_pid(p, PIDTYPE_SID, task_session(current));

			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}

			p->signal->leader_pid = pid;
			p->signal->tty = tty_kref_get(current->signal->tty);
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			attach_pid(p, PIDTYPE_PGID);
			attach_pid(p, PIDTYPE_SID);
			__this_cpu_inc(process_counts);
		} else {
			/* Extra thread: account for it in the shared signal_struct. */
			current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			atomic_inc(&current->signal->sigcnt);
			list_add_tail_rcu(&p->thread_group,
					  &p->group_leader->thread_group);
			list_add_tail_rcu(&p->thread_node,
					  &p->signal->thread_head);
		}
		attach_pid(p, PIDTYPE_PID);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);

	proc_fork_connector(p);
	cgroup_post_fork(p, cgrp_ss_priv);
	threadgroup_change_end(current);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);
	uprobe_copy_process(p, clone_flags);

	return p;

	/* Error unwinding: each label undoes one setup step, newest first. */
bad_fork_cancel_cgroup:
	cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	if (!(clone_flags & CLONE_THREAD))
		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_perf:
	perf_event_free_task(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
	threadgroup_change_end(current);
	delayacct_tsk_free(p);
bad_fork_cleanup_count:
	atomic_dec(&p->cred->user->processes);
	exit_creds(p);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}
5、copy_thread_tls 函数
/*
 * copy_thread_tls - adapter from the TLS-aware interface to copy_thread().
 *
 * The tls argument is accepted but not used here: copy_thread() takes no
 * TLS parameter, so only the first four arguments are forwarded.
 *
 * Returns the result of copy_thread().
 */
static inline int copy_thread_tls(
		unsigned long clone_flags, unsigned long sp, unsigned long arg,
		struct task_struct *p, unsigned long tls)
{
	int ret;

	ret = copy_thread(clone_flags, sp, arg, p);

	return ret;
}
copy_thread 函数
int copy_thread(unsigned long clone_flags, unsigned long stack_start,
unsigned long stk_sz, struct task_struct *p)
{
struct pt_regs *childregs = task_pt_regs(p); //获取新建进程的ptregs地址
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); //清空新建进程的thread里的cpu上下文
if (likely(!(p->flags & PF_KTHREAD))) { //用户线程
*childregs = *current_pt_regs();
childregs->regs[0] = 0; //清0第一个寄存器,arm中通常将x0作为函数返回值返回
/*
* Read the current TLS pointer from tpidr_el0 as it may be
* out-of-sync with the saved value.
*/
asm("mrs %0, tpidr_el0" : "=r" (*task_user_tls(p))); //将 tpidr_el0寄存器值存储在p->thread.tp_value
if (stack_start) { //指定了用户栈起始地址
if (is_compat_thread(task_thread_info(p)))
childregs->compat_sp = stack_start;
/* 16-byte aligned stack mandatory on AArch64 */
else if (stack_start & 15)
return -EINVAL;
else
childregs->sp = stack_start; //用户栈起始地址
}
/*
* If a TLS pointer was passed to clone (4th argument), use it
* for the new thread.
*/
if (clone_flags & CLONE_SETTLS)
p->thread.tp_value = childregs->regs[3];
} else {
memset(childregs, 0, sizeof(struct pt_regs));
childregs->pstate = PSR_MODE_EL1h;
p->thread.cpu_context.x19 = stack_start; //设置内核线程执行函数
p->thread.cpu_context.x20 = stk_sz;//设置内核线程执行函数的参数
}
p->thread.cpu_context.pc = (unsigned long)ret_from_fork; //设置硬件上下文pc地址
p->thread.cpu_context.sp = (unsigned long)childregs; //设置硬件上下文栈地址
ptrace_hw_copy_thread(p);
return 0;
}