When the fork() system call enters do_fork() via sys_fork(), its clone_flags is just SIGCHLD; in other words, every CLONE_* flag bit is 0, so copy_files(), copy_fs(), copy_sighand()
and copy_mm() all do real work: all four resources are genuinely copied.
When vfork() enters do_fork() via sys_vfork(), its clone_flags is CLONE_VFORK | CLONE_VM | SIGCHLD, so only copy_files(), copy_fs() and copy_sighand() perform real copies,
while copy_mm(), because the CLONE_VM bit is 1, merely shares the parent's address space through pointers, including the user-space stack.
__clone(), finally, behaves according to the flags supplied at the call site.
32/*
33 * cloning flags:
34 */
35#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
36#define CLONE_VM 0x00000100 /* set if VM shared between processes */
37#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
38#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
39#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
40#define CLONE_PID 0x00001000 /* set if pid shared */
41#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
42#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
43#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
44#define CLONE_THREAD 0x00010000 /* Same thread group? */
45#define CLONE_NEWNS 0x00020000 /* New namespace group? */
46
47#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD)
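How these flag bits combine at the user level can be sketched with a small program. This is only an illustration, assuming glibc's clone() wrapper; thread_fn() and the 64KB stack size are made up for the example, and the thread-style flag set shown is typical of what a threads library might pass, not a path taken by the kernel code above.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

/* hypothetical child entry point for clone() */
static int thread_fn(void *arg)
{
	const char msg[] = "child running\n";
	write(1, msg, sizeof msg - 1);
	return 0;
}

int main(void)
{
	/* fork():  clone_flags == SIGCHLD, all CLONE_* bits clear   */
	/* vfork(): clone_flags == CLONE_VFORK | CLONE_VM | SIGCHLD  */
	/* a thread-style __clone() typically shares all four resources: */
	int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD;

	char *stack = malloc(64 * 1024);
	if (!stack)
		return 1;
	/* the child stack grows downward on i386, so pass its high end */
	pid_t pid = clone(thread_fn, stack + 64 * 1024, flags, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	return 0;
}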
710asmlinkage int sys_fork(struct pt_regs regs)
711{
712 return do_fork(SIGCHLD, regs.esp, &regs, 0);
713}
714
715asmlinkage int sys_clone(struct pt_regs regs)
716{
717 unsigned long clone_flags;
718 unsigned long newsp;
719
720 clone_flags = regs.ebx;
721 newsp = regs.ecx;
722 if (!newsp)
723 newsp = regs.esp;
724 return do_fork(clone_flags, newsp, &regs, 0);
725}
726
727/*
728 * This is trivial, and on the face of it looks like it
729 * could equally well be done in user mode.
730 *
731 * Not so, for quite unobvious reasons - register pressure.
732 * In user mode vfork() cannot have a stack frame, and if
733 * done by calling the "clone()" system call directly, you
734 * do not have enough call-clobbered registers to hold all
735 * the information you need.
736 */
737asmlinkage int sys_vfork(struct pt_regs regs)
738{
739 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
740}
741
546/*
547 * Ok, this is the main fork-routine. It copies the system process
548 * information (task[nr]) and sets up the necessary registers. It also
549 * copies the data segment in its entirety. The "stack_start" and
550 * "stack_top" arguments are simply passed along to the platform
551 * specific copy_thread() routine. Most platforms ignore stack_top.
552 * For an example that's using stack_top, see
553 * arch/ia64/kernel/process.c.
554 */
555int do_fork(unsigned long clone_flags, unsigned long stack_start,
556 struct pt_regs *regs, unsigned long stack_size)
557{
558 int retval = -ENOMEM;
559 struct task_struct *p;
560 DECLARE_MUTEX_LOCKED(sem);
561
562 if (clone_flags & CLONE_PID) {
563 /* This is only allowed from the boot up thread */
564 if (current->pid)
565 return -EPERM;
566 }
567
568 current->vfork_sem = &sem;
569
570 p = alloc_task_struct();
571 if (!p)
572 goto fork_out;
573
574 *p = *current;
575
576 retval = -EAGAIN;
577 if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
578 goto bad_fork_free;
579 atomic_inc(&p->user->__count);
580 atomic_inc(&p->user->processes);
581
582 /*
583 * Counter increases are protected by
584 * the kernel lock so nr_threads can't
585 * increase under us (but it may decrease).
586 */
587 if (nr_threads >= max_threads)
588 goto bad_fork_cleanup_count;
589
590 get_exec_domain(p->exec_domain);
591
592 if (p->binfmt && p->binfmt->module)
593 __MOD_INC_USE_COUNT(p->binfmt->module);
594
595 p->did_exec = 0;
596 p->swappable = 0;
597 p->state = TASK_UNINTERRUPTIBLE;
598
599 copy_flags(clone_flags, p);
600 p->pid = get_pid(clone_flags);
assign the pid for the new process
601
602 p->run_list.next = NULL;
603 p->run_list.prev = NULL;
initialize the run-queue links
604
605 if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
606 p->p_opptr = current;
set the original parent (p_opptr)
607 if (!(p->ptrace & PT_PTRACED))
608 p->p_pptr = current;
set the parent process (p_pptr)
609 }
610 p->p_cptr = NULL;
clear the child pointer (p_cptr)
611 init_waitqueue_head(&p->wait_chldexit);
initialize the wait queue for exiting children
612 p->vfork_sem = NULL;
613 spin_lock_init(&p->alloc_lock);
614
615 p->sigpending = 0;
616 init_sigpending(&p->pending);
initialize the pending-signal queue
617
618 p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
619 p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
620 init_timer(&p->real_timer);
621 p->real_timer.data = (unsigned long) p;
622
623 p->leader = 0; /* session leadership doesn't inherit */
624 p->tty_old_pgrp = 0;
625 p->times.tms_utime = p->times.tms_stime = 0;
626 p->times.tms_cutime = p->times.tms_cstime = 0;
627#ifdef CONFIG_SMP
628 {
629 int i;
630 p->has_cpu = 0;
631 p->processor = current->processor;
632 /* ?? should we just memset this ?? */
633 for(i = 0; i < smp_num_cpus; i++)
634 p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
635 spin_lock_init(&p->sigmask_lock);
636 }
637#endif
638 p->lock_depth = -1; /* -1 = no lock */
639 p->start_time = jiffies;
record the process creation time
640
641 retval = -ENOMEM;
642 /* copy all the process information */
643 if (copy_files(clone_flags, p))
conditionally copy the control structures for open files
644 goto bad_fork_cleanup;
645 if (copy_fs(clone_flags, p))
646 goto bad_fork_cleanup_files;
647 if (copy_sighand(clone_flags, p))
648 goto bad_fork_cleanup_fs;
649 if (copy_mm(clone_flags, p))
650 goto bad_fork_cleanup_sighand;
651 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
652 if (retval)
653 goto bad_fork_cleanup_sighand;
654 p->semundo = NULL;
655
656 /* Our parent execution domain becomes current domain
657 These must match for thread signalling to apply */
658
659 p->parent_exec_id = p->self_exec_id;
the parent-execution-domain ID is set to this process's own execution-domain ID
660
661 /* ok, now we should be set up.. */
662 p->swappable = 1;
this process's pages may now be swapped out
663 p->exit_signal = clone_flags & CSIGNAL;
the signal to send to the parent when this process calls exit()
664 p->pdeath_signal = 0;
the signal the parent is asked to send to this process when the parent exits
665
666 /*
667 * "share" dynamic priority between parent and child, thus the
668 * total amount of dynamic priorities in the system doesnt change,
669 * more scheduling fairness. This is only important in the first
670 * timeslice, on the long run the scheduling behaviour is unchanged.
671 */
672 p->counter = (current->counter + 1) >> 1;
the process's CPU time quantum: the parent's remaining quantum is split in half between parent and child
673 current->counter >>= 1;
674 if (!current->counter)
675 current->need_resched = 1;
676
677 /*
678 * Ok, add it to the run-queues and make it
679 * visible to the rest of the system.
680 *
681 * Let it rip!
682 */
683 retval = p->pid;
684 p->tgid = retval;
685 INIT_LIST_HEAD(&p->thread_group);
initialize the thread-group list
686 write_lock_irq(&tasklist_lock);
687 if (clone_flags & CLONE_THREAD) {
688 p->tgid = current->tgid;
689 list_add(&p->thread_group, &current->thread_group);
690 }
691 SET_LINKS(p);
link into the kernel's process list
692 hash_pid(p);
link into the pid hash table
693 nr_threads++;
694 write_unlock_irq(&tasklist_lock);
695
696 if (p->ptrace & PT_PTRACED)
697 send_sig(SIGSTOP, p, 1);
698
699 wake_up_process(p); /* do this last */
wake up the child process and put it on the run queue
700 ++total_forks;
701
702fork_out:
703 if ((clone_flags & CLONE_VFORK) && (retval > 0))
704 down(&sem);
705 return retval;
706
707bad_fork_cleanup_sighand:
708 exit_sighand(p);
709bad_fork_cleanup_fs:
710 exit_fs(p); /* blocking */
711bad_fork_cleanup_files:
712 exit_files(p); /* blocking */
713bad_fork_cleanup:
714 put_exec_domain(p->exec_domain);
715 if (p->binfmt && p->binfmt->module)
716 __MOD_DEC_USE_COUNT(p->binfmt->module);
717bad_fork_cleanup_count:
718 atomic_dec(&p->user->processes);
719 free_uid(p->user);
720bad_fork_free:
721 free_task_struct(p);
722 goto fork_out;
723}
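Note the down(&sem) on the fork_out path: when CLONE_VFORK is set, the parent sleeps on this semaphore, which the child releases in mm_release() when it exits or calls execve(). That is the whole user-visible contract of vfork(). A minimal sketch of the effect:

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = vfork();
	if (pid == 0) {
		/* the child runs first, borrowing the parent's address space */
		write(1, "child exiting\n", 14);
		_exit(0);        /* releases the parent blocked in down(&sem) */
	}
	/* the parent resumes only after the child has exited or exec'ed */
	printf("parent resumed after child %d\n", pid);
	waitpid(pid, NULL, 0);
	return 0;
}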
The pid for the new process is allocated by get_pid():
87static int get_pid(unsigned long flags)
88{
89 static int next_safe = PID_MAX;
90 struct task_struct *p;
91 int pid, beginpid;
92
93 if (flags & CLONE_PID)
94 return current->pid;
95
96 spin_lock(&lastpid_lock);
97 beginpid = last_pid;
98 if((++last_pid) & 0xffff8000) {
99 last_pid = 300; /* Skip daemons etc. */
100 goto inside;
101 }
102 if(last_pid >= next_safe) {
103inside:
104 next_safe = PID_MAX;
105 read_lock(&tasklist_lock);
106 repeat:
107 for_each_task(p) {
108 if(p->pid == last_pid ||
109 p->pgrp == last_pid ||
110 p->tgid == last_pid ||
111 p->session == last_pid) {
112 if(++last_pid >= next_safe) {
113 if(last_pid & 0xffff8000)
114 last_pid = 300;
115 next_safe = PID_MAX;
116 }
117 if(unlikely(last_pid == beginpid)) {
118 next_safe = 0;
119 goto nomorepids;
120 }
121 goto repeat;
122 }
123 if(p->pid > last_pid && next_safe > p->pid)
124 next_safe = p->pid;
125 if(p->pgrp > last_pid && next_safe > p->pgrp)
126 next_safe = p->pgrp;
127 if(p->tgid > last_pid && next_safe > p->tgid)
128 next_safe = p->tgid;
129 if(p->session > last_pid && next_safe > p->session)
130 next_safe = p->session;
131 }
132 read_unlock(&tasklist_lock);
133 }
134 pid = last_pid;
135 spin_unlock(&lastpid_lock);
136
137 return pid;
138
139nomorepids:
140 read_unlock(&tasklist_lock);
141 spin_unlock(&lastpid_lock);
142 return 0;
143}
Copying the control structures for open files:
436static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
437{
438 struct files_struct *oldf, *newf;
439 struct file **old_fds, **new_fds;
440 int open_files, nfds, size, i, error = 0;
441
442 /*
443 * A background process may not have any files ...
444 */
445 oldf = current->files;
446 if (!oldf)
447 goto out;
448
449 if (clone_flags & CLONE_FILES) {
450 atomic_inc(&oldf->count);
parent and child share this structure, which now has one more user
451 goto out;
452 }
453
454 /*
455 * Note: we may be using current for both targets (See exec.c)
456 * This works because we cache current->files (old) as oldf. Don't
457 * break this.
458 */
459 tsk->files = NULL;
460 error = -ENOMEM;
461 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
allocate a files_struct for the child
462 if (!newf)
463 goto out;
464
465 atomic_set(&newf->count, 1);
466
467 newf->file_lock = RW_LOCK_UNLOCKED;
468 newf->next_fd = 0;
469 newf->max_fds = NR_OPEN_DEFAULT;
470 newf->max_fdset = __FD_SETSIZE;
471 newf->close_on_exec = &newf->close_on_exec_init;
472 newf->open_fds = &newf->open_fds_init;
473 newf->fd = &newf->fd_array[0];
474
475 /* We don't yet have the oldf readlock, but even if the old
476 fdset gets grown now, we'll only copy up to "size" fds */
477 size = oldf->max_fdset;
the capacity of the parent's open-file bitmap
478 if (size > __FD_SETSIZE) {
479 newf->max_fdset = 0;
480 write_lock(&newf->file_lock);
481 error = expand_fdset(newf, size-1);
expand the capacity
482 write_unlock(&newf->file_lock);
483 if (error)
484 goto out_release;
485 }
486 read_lock(&oldf->file_lock);
487
488 open_files = count_open_files(oldf, size);
489
490 /*
491 * Check whether we need to allocate a larger fd array.
492 * Note: we're not a clone task, so the open count won't
493 * change.
494 */
495 nfds = NR_OPEN_DEFAULT;
496 if (open_files > nfds) {
497 read_unlock(&oldf->file_lock);
498 newf->max_fds = 0;
499 write_lock(&newf->file_lock);
500 error = expand_fd_array(newf, open_files-1);
501 write_unlock(&newf->file_lock);
502 if (error)
503 goto out_release;
504 nfds = newf->max_fds;
505 read_lock(&oldf->file_lock);
506 }
507
508 old_fds = oldf->fd;
509 new_fds = newf->fd;
510
511 memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
512 memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
513
514 for (i = open_files; i != 0; i--) {
515 struct file *f = *old_fds++;
516 if (f) {
517 get_file(f);
518 } else {
519 /*
520 * The fd may be claimed in the fd bitmap but not yet
521 * instantiated in the files array if a sibling thread
522 * is partway through open(). So make sure that this
523 * fd is available to the new process.
524 */
525 FD_CLR(open_files - i, newf->open_fds);
526 }
527 *new_fds++ = f;
528 }
529 read_unlock(&oldf->file_lock);
530
531 /* compute the remainder to be cleared */
532 size = (newf->max_fds - open_files) * sizeof(struct file *);
533
534 /* This is long word aligned thus could use a optimized version */
535 memset(new_fds, 0, size);
536
537 if (newf->max_fdset > open_files) {
538 int left = (newf->max_fdset-open_files)/8;
539 int start = open_files / (8 * sizeof(unsigned long));
540
541 memset(&newf->open_fds->fds_bits[start], 0, left);
542 memset(&newf->close_on_exec->fds_bits[start], 0, left);
543 }
544
545 tsk->files = newf;
546 error = 0;
547out:
548 return error;
549
550out_release:
551 free_fdset (newf->close_on_exec, newf->max_fdset);
552 free_fdset (newf->open_fds, newf->max_fdset);
553 kmem_cache_free(files_cachep, newf);
554 goto out;
555}
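The CLONE_FILES branch at the top (just atomic_inc(&oldf->count)) means parent and child operate on one and the same descriptor table. A sketch of the observable difference, assuming glibc's clone() wrapper; closer() and the stack size are made up for the example:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int fd;

/* hypothetical child: closes an entry in the shared descriptor table */
static int closer(void *arg)
{
	close(fd);
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	fd = open("/dev/null", O_WRONLY);

	pid_t pid = clone(closer, stack + 64 * 1024, CLONE_FILES | SIGCHLD, NULL);
	waitpid(pid, NULL, 0);

	/* with CLONE_FILES the child's close() hit our table too, so this
	   write fails with EBADF; after a plain fork() it would succeed */
	if (write(fd, "x", 1) < 0)
		perror("write");
	return 0;
}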
411static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
412{
413 if (clone_flags & CLONE_FS) {
414 atomic_inc(&current->fs->count);
415 return 0;
416 }
417 tsk->fs = __copy_fs_struct(current->fs);
418 if (!tsk->fs)
419 return -1;
420 return 0;
421}
406struct fs_struct *copy_fs_struct(struct fs_struct *old)
407{
408 return __copy_fs_struct(old);
409}
381static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
382{
383 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
384 /* We don't need to lock fs - think why ;-) */
385 if (fs) {
386 atomic_set(&fs->count, 1);
387 fs->lock = RW_LOCK_UNLOCKED;
388 fs->umask = old->umask;
389 read_lock(&old->lock);
390 fs->rootmnt = mntget(old->rootmnt);
391 fs->root = dget(old->root);
392 fs->pwdmnt = mntget(old->pwdmnt);
393 fs->pwd = dget(old->pwd);
394 if (old->altroot) {
395 fs->altrootmnt = mntget(old->altrootmnt);
396 fs->altroot = dget(old->altroot);
397 } else {
398 fs->altrootmnt = NULL;
399 fs->altroot = NULL;
400 }
401 read_unlock(&old->lock);
402 }
403 return fs;
404}
584static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
585{
586 struct signal_struct *sig;
587
588 if (clone_flags & CLONE_SIGHAND) {
589 atomic_inc(&current->sig->count);
590 return 0;
591 }
592 sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
593 tsk->sig = sig;
594 if (!sig)
595 return -1;
596 spin_lock_init(&sig->siglock);
597 atomic_set(&sig->count, 1);
598 memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
599 return 0;
600}
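Sharing sig (the CLONE_SIGHAND branch) means a handler installed by either process is instantly in effect for both. A sketch, again assuming glibc's clone() wrapper; installer() is made up, and CLONE_VM is included because modern kernels require it alongside CLONE_SIGHAND:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile sig_atomic_t seen;

static void on_usr1(int sig) { seen = 1; }

/* hypothetical child: installs a handler in the shared signal table */
static int installer(void *arg)
{
	signal(SIGUSR1, on_usr1);
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	pid_t pid = clone(installer, stack + 64 * 1024,
			  CLONE_VM | CLONE_SIGHAND | SIGCHLD, NULL);
	waitpid(pid, NULL, 0);

	raise(SIGUSR1);    /* runs on_usr1 here, in the parent */
	printf("handler installed by the child ran in the parent: %d\n", seen);
	return 0;
}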
318static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
319{
320 struct mm_struct * mm, *oldmm;
321 int retval;
322
323 tsk->min_flt = tsk->maj_flt = 0;
324 tsk->cmin_flt = tsk->cmaj_flt = 0;
325 tsk->nswap = tsk->cnswap = 0;
326
327 tsk->mm = NULL;
328 tsk->active_mm = NULL;
329
330 /*
331 * Are we cloning a kernel thread?
332 *
333 * We need to steal a active VM for that..
334 */
335 oldmm = current->mm;
336 if (!oldmm)
337 return 0;
338
339 if (clone_flags & CLONE_VM) {
340 atomic_inc(&oldmm->mm_users);
341 mm = oldmm;
342 goto good_mm;
343 }
344
345 retval = -ENOMEM;
346 mm = allocate_mm();
347 if (!mm)
348 goto fail_nomem;
349
350 /* Copy the current MM stuff.. */
351 memcpy(mm, oldmm, sizeof(*mm));
352 if (!mm_init(mm))
353 goto fail_nomem;
354
355 if (init_new_context(tsk,mm))
356 goto free_pt;
357
358 down_write(&oldmm->mmap_sem);
359 retval = dup_mmap(mm);
deep-copy the parent's memory map
360 up_write(&oldmm->mmap_sem);
361
362 if (retval)
363 goto free_pt;
364
365 /*
366 * child gets a private LDT (if there was an LDT in the parent)
367 */
368 copy_segments(tsk, mm);
369
370good_mm:
371 tsk->mm = mm;
372 tsk->active_mm = mm;
373 return 0;
374
375free_pt:
376 mmput(mm);
377fail_nomem:
378 return retval;
379}
145static inline int dup_mmap(struct mm_struct * mm)
146{
147 struct vm_area_struct * mpnt, *tmp, **pprev;
148 int retval;
149
150 flush_cache_mm(current->mm);
151 mm->locked_vm = 0;
152 mm->mmap = NULL;
153 mm->mmap_cache = NULL;
154 mm->map_count = 0;
155 mm->rss = 0;
156 mm->cpu_vm_mask = 0;
157 mm->swap_address = 0;
158 pprev = &mm->mmap;
159
160 /*
161 * Add it to the mmlist after the parent.
162 * Doing it this way means that we can order the list,
163 * and fork() won't mess up the ordering significantly.
164 * Add it first so that swapoff can see any swap entries.
165 */
166 spin_lock(&mmlist_lock);
167 list_add(&mm->mmlist, &current->mm->mmlist);
168 mmlist_nr++;
169 spin_unlock(&mmlist_lock);
170
171 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
172 struct file *file;
173
174 retval = -ENOMEM;
175 if(mpnt->vm_flags & VM_DONTCOPY)
176 continue;
177 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
178 if (!tmp)
179 goto fail_nomem;
180 *tmp = *mpnt;
181 tmp->vm_flags &= ~VM_LOCKED;
182 tmp->vm_mm = mm;
183 tmp->vm_next = NULL;
184 file = tmp->vm_file;
185 if (file) {
186 struct inode *inode = file->f_dentry->d_inode;
187 get_file(file);
188 if (tmp->vm_flags & VM_DENYWRITE)
189 atomic_dec(&inode->i_writecount);
190
191 /* insert tmp into the share list, just after mpnt */
192 spin_lock(&inode->i_mapping->i_shared_lock);
193 if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
194 mpnt->vm_next_share->vm_pprev_share =
195 &tmp->vm_next_share;
196 mpnt->vm_next_share = tmp;
197 tmp->vm_pprev_share = &mpnt->vm_next_share;
198 spin_unlock(&inode->i_mapping->i_shared_lock);
199 }
200
201 /*
202 * Link in the new vma and copy the page table entries:
203 * link in first so that swapoff can see swap entries.
204 */
205 spin_lock(&mm->page_table_lock);
206 *pprev = tmp;
207 pprev = &tmp->vm_next;
208 mm->map_count++;
209 retval = copy_page_range(mm, current->mm, tmp);
210 spin_unlock(&mm->page_table_lock);
211
212 if (tmp->vm_ops && tmp->vm_ops->open)
213 tmp->vm_ops->open(tmp);
214
215 if (retval)
216 goto fail_nomem;
217 }
218 retval = 0;
219 build_mmap_rb(mm);
220
221fail_nomem:
222 flush_tlb_mm(current->mm);
223 return retval;
224}
144/*
145 * copy one vm_area from one task to the other. Assumes the page tables
146 * already present in the new task to be cleared in the whole range
147 * covered by this vma.
148 *
149 * 08Jan98 Merged into one routine from several inline routines to reduce
150 * variable count and make things faster. -jj
151 */
152int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
153 struct vm_area_struct *vma)
154{
155 pgd_t * src_pgd, * dst_pgd;
156 unsigned long address = vma->vm_start;
157 unsigned long end = vma->vm_end;
158 unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
159
160 src_pgd = pgd_offset(src, address)-1;
161 dst_pgd = pgd_offset(dst, address)-1;
162
163 for (;;) {
loop over the page-directory entries
164 pmd_t * src_pmd, * dst_pmd;
165
166 src_pgd++; dst_pgd++;
167
168 /* copy_pmd_range */
169
170 if (pgd_none(*src_pgd))
171 goto skip_copy_pmd_range;
172 if (pgd_bad(*src_pgd)) {
173 pgd_ERROR(*src_pgd);
174 pgd_clear(src_pgd);
175skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
176 if (!address || (address >= end))
177 goto out;
178 continue;
179 }
180 if (pgd_none(*dst_pgd)) {
181 if (!pmd_alloc(dst_pgd, 0))
182 goto nomem;
183 }
184
185 src_pmd = pmd_offset(src_pgd, address);
186 dst_pmd = pmd_offset(dst_pgd, address);
187
188 do {
loop over the middle-directory entries
189 pte_t * src_pte, * dst_pte;
190
191 /* copy_pte_range */
192
193 if (pmd_none(*src_pmd))
194 goto skip_copy_pte_range;
195 if (pmd_bad(*src_pmd)) {
196 pmd_ERROR(*src_pmd);
197 pmd_clear(src_pmd);
198skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
199 if (address >= end)
200 goto out;
201 goto cont_copy_pmd_range;
202 }
203 if (pmd_none(*dst_pmd)) {
204 if (!pte_alloc(dst_pmd, 0))
205 goto nomem;
206 }
207
208 src_pte = pte_offset(src_pmd, address);
209 dst_pte = pte_offset(dst_pmd, address);
210
211 do {
loop over the page-table entries
212 pte_t pte = *src_pte;
213 struct page *ptepage;
214
215 /* copy_one_pte */
216
217 if (pte_none(pte))
no mapping has been established for this page
218 goto cont_copy_pte_range_noset;
219 if (!pte_present(pte)) {
the page contents are out on the swap device
220 swap_duplicate(pte_to_swp_entry(pte));
increment the swap entry's share count
221 goto cont_copy_pte_range;
222 }
223 ptepage = pte_page(pte);
224 if ((!VALID_PAGE(ptepage)) ||
225 PageReserved(ptepage))
handling of invalid pages (device memory, or pages reserved by the kernel)
226 goto cont_copy_pte_range;
227
228 /* If it's a COW mapping, write protect it both in the parent and the child */
229 if (cow) {
230 ptep_set_wrprotect(src_pte);
make the parent's page-table entry write-protected as well
231 pte = *src_pte;
232 }
233
234 /* If it's a shared mapping, mark it clean in the child */
235 if (vma->vm_flags & VM_SHARED)
236 pte = pte_mkclean(pte);
the (now write-protected) entry is then installed in the child's page table
237 pte = pte_mkold(pte);
238 get_page(ptepage);
239
240cont_copy_pte_range: set_pte(dst_pte, pte);
241cont_copy_pte_range_noset: address += PAGE_SIZE;
242 if (address >= end)
243 goto out;
244 src_pte++;
245 dst_pte++;
246 } while ((unsigned long)src_pte & PTE_TABLE_MASK);
247
248cont_copy_pmd_range: src_pmd++;
249 dst_pmd++;
250 } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
251 }
252out:
253 return 0;
254
255nomem:
256 return -ENOMEM;
257}
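The ptep_set_wrprotect() above is the copy-on-write trap being armed: both processes keep mapping the same physical pages read-only, and the first write by either side faults and gets a private copy. The user-visible result after a plain fork():

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static int value = 1;

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		value = 2;     /* write fault: the kernel copies the page */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	/* the parent's copy was never touched; this prints 1 */
	printf("parent still sees value = %d\n", value);
	return 0;
}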
499/*
500 * we do not have to muck with descriptors here, that is
501 * done in switch_mm() as needed.
502 */
503void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
504{
505 struct mm_struct * old_mm;
506 void *old_ldt, *ldt;
507
508 ldt = NULL;
509 old_mm = current->mm;
510 if (old_mm && (old_ldt = old_mm->context.segments) != NULL) {
511 /*
512 * Completely new LDT, we initialize it from the parent:
513 */
514 ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
515 if (!ldt)
516 printk(KERN_WARNING "ldt allocation failed\n");
517 else
518 memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
519 }
520 new_mm->context.segments = ldt;
521}
529int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
530 unsigned long unused,
531 struct task_struct * p, struct pt_regs * regs)
532{
533 struct pt_regs * childregs;
534
535 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
obtain a pointer to the pt_regs structure on the child's kernel-space stack
536 struct_cpy(childregs, regs);
537 childregs->eax = 0;
set the child's return value to 0
538 childregs->esp = esp;
set the child's user-space stack pointer
539
540 p->thread.esp = (unsigned long) childregs;
record the start of the pt_regs structure as the child's saved kernel stack pointer
541 p->thread.esp0 = (unsigned long) (childregs+1);
record the top of the child's kernel-space stack
542
543 p->thread.eip = (unsigned long) ret_from_fork;
the entry point for the next time this process is switched in; set to ret_from_fork so that the child starts there when first scheduled
544
545 savesegment(fs,p->thread.fs);
save the current value of segment register fs in p->thread.fs
546 savesegment(gs,p->thread.gs);
547
548 unlazy_fpu(current);
549 struct_cpy(&p->thread.i387, &current->thread.i387);
550
551 return 0;
552}
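Because copy_thread() stores 0 into childregs->eax and points thread.eip at ret_from_fork, the one call to fork() appears to return twice: the parent receives retval (the child's pid) from do_fork(), while the child, resuming through the copied register frame, receives 0. This is precisely the convention user code tests:

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		/* child: eax in the copied pt_regs was set to 0 */
		printf("fork() returned 0 in the child\n");
		_exit(0);
	}
	/* parent: do_fork() returned retval == p->pid */
	printf("fork() returned %d in the parent\n", pid);
	waitpid(pid, NULL, 0);
	return 0;
}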
This article has walked through the process-duplication machinery of the Linux kernel, in particular the implementation of the fork(), vfork() and __clone() system calls: how each call sets clone_flags to decide which resources are copied and which are shared, covering open file descriptors, filesystem information, signal handlers and the memory map.