When the fork() system call enters do_fork() via sys_fork(), its clone_flags is just SIGCHLD; in other words, every CLONE_* flag bit is 0, so copy_files(), copy_fs(), copy_sighand()
and copy_mm() all do real work: all four resources are genuinely copied.
When vfork() enters do_fork() via sys_vfork(), its clone_flags is CLONE_VFORK | CLONE_VM | SIGCHLD, so only copy_files(), copy_fs() and copy_sighand() perform real copies,
while copy_mm(), because the CLONE_VM bit is 1, merely shares the parent's address space through pointers, including the user-space stack.
__clone(), finally, behaves according to the flags supplied at the call site.
32/*
33 * cloning flags:
34 */
35#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
36#define CLONE_VM 0x00000100 /* set if VM shared between processes */
37#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
38#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
39#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
40#define CLONE_PID 0x00001000 /* set if pid shared */
41#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
42#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
43#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
44#define CLONE_THREAD 0x00010000 /* Same thread group? */
45#define CLONE_NEWNS 0x00020000 /* New namespace group? */
46
47#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD)
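How these flag bits combine at the user level can be sketched with a small program. This is only an illustration, assuming glibc's clone() wrapper; thread_fn() and the 64KB stack size are made up for the example, and the thread-style flag set shown is typical of what a threads library might pass, not a path taken by the kernel code above.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

/* hypothetical child entry point for clone() */
static int thread_fn(void *arg)
{
	const char msg[] = "child running\n";
	write(1, msg, sizeof msg - 1);
	return 0;
}

int main(void)
{
	/* fork():  clone_flags == SIGCHLD, all CLONE_* bits clear   */
	/* vfork(): clone_flags == CLONE_VFORK | CLONE_VM | SIGCHLD  */
	/* a thread-style __clone() typically shares all four resources: */
	int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD;

	char *stack = malloc(64 * 1024);
	if (!stack)
		return 1;
	/* the child stack grows downward on i386, so pass its high end */
	pid_t pid = clone(thread_fn, stack + 64 * 1024, flags, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	return 0;
}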
710asmlinkage int sys_fork(struct pt_regs regs)
711{
712 return do_fork(SIGCHLD, regs.esp, &regs, 0);
713}
714
715asmlinkage int sys_clone(struct pt_regs regs)
716{
717 unsigned long clone_flags;
718 unsigned long newsp;
719
720 clone_flags = regs.ebx;
721 newsp = regs.ecx;
722 if (!newsp)
723 newsp = regs.esp;
724 return do_fork(clone_flags, newsp, &regs, 0);
725}
726
727/*
728 * This is trivial, and on the face of it looks like it
729 * could equally well be done in user mode.
730 *
731 * Not so, for quite unobvious reasons - register pressure.
732 * In user mode vfork() cannot have a stack frame, and if
733 * done by calling the "clone()" system call directly, you
734 * do not have enough call-clobbered registers to hold all
735 * the information you need.
736 */
737asmlinkage int sys_vfork(struct pt_regs regs)
738{
739 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
740}
741
546/*
547 * Ok, this is the main fork-routine. It copies the system process
548 * information (task[nr]) and sets up the necessary registers. It also
549 * copies the data segment in its entirety. The "stack_start" and
550 * "stack_top" arguments are simply passed along to the platform
551 * specific copy_thread() routine. Most platforms ignore stack_top.
552 * For an example that's using stack_top, see
553 * arch/ia64/kernel/process.c.
554 */
555int do_fork(unsigned long clone_flags, unsigned long stack_start,
556 struct pt_regs *regs, unsigned long stack_size)
557{
558 int retval = -ENOMEM;
559 struct task_struct *p;
560 DECLARE_MUTEX_LOCKED(sem);
561
562 if (clone_flags & CLONE_PID) {
563 /* This is only allowed from the boot up thread */
564 if (current->pid)
565 return -EPERM;
566 }
567
568 current->vfork_sem = &sem;
569
570 p = alloc_task_struct();
571 if (!p)
572 goto fork_out;
573
574 *p = *current;
575
576 retval = -EAGAIN;
577 if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
578 goto bad_fork_free;
579 atomic_inc(&p->user->__count);
580 atomic_inc(&p->user->processes);
581
582 /*
583 * Counter increases are protected by
584 * the kernel lock so nr_threads can't
585 * increase under us (but it may decrease).
586 */
587 if (nr_threads >= max_threads)
588 goto bad_fork_cleanup_count;
589
590 get_exec_domain(p->exec_domain);
591
592 if (p->binfmt && p->binfmt->module)
593 __MOD_INC_USE_COUNT(p->binfmt->module);
594
595 p->did_exec = 0;
596 p->swappable = 0;
597 p->state = TASK_UNINTERRUPTIBLE;
598
599 copy_flags(clone_flags, p);
600 p->pid = get_pid(clone_flags);
assign the pid for the new process
601
602 p->run_list.next = NULL;
603 p->run_list.prev = NULL;
initialize the run-queue links
604
605 if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
606 p->p_opptr = current;
set the original parent (p_opptr)
607 if (!(p->ptrace & PT_PTRACED))
608 p->p_pptr = current;
set the parent process (p_pptr)
609 }
610 p->p_cptr = NULL;
clear the child pointer (p_cptr)
611 init_waitqueue_head(&p->wait_chldexit);
initialize the wait queue for exiting children
612 p->vfork_sem = NULL;
613 spin_lock_init(&p->alloc_lock);
614
615 p->sigpending = 0;
616 init_sigpending(&p->pending);
initialize the pending-signal queue
617
618 p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
619 p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
620 init_timer(&p->real_timer);
621 p->real_timer.data = (unsigned long) p;
622
623 p->leader = 0; /* session leadership doesn't inherit */
624 p->tty_old_pgrp = 0;
625 p->times.tms_utime = p->times.tms_stime = 0;
626 p->times.tms_cutime = p->times.tms_cstime = 0;
627#ifdef CONFIG_SMP
628 {
629 int i;
630 p->has_cpu = 0;
631 p->processor = current->processor;
632 /* ?? should we just memset this ?? */
633 for(i = 0; i < smp_num_cpus; i++)
634 p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
635 spin_lock_init(&p->sigmask_lock);
636 }
637#endif
638 p->lock_depth = -1; /* -1 = no lock */
639 p->start_time = jiffies;
record the process creation time
640
641 retval = -ENOMEM;
642 /* copy all the process information */
643 if (copy_files(clone_flags, p))
conditionally copy the control structures for open files
644 goto bad_fork_cleanup;
645 if (copy_fs(clone_flags, p))
646 goto bad_fork_cleanup_files;
647 if (copy_sighand(clone_flags, p))
648 goto bad_fork_cleanup_fs;
649 if (copy_mm(clone_flags, p))
650 goto bad_fork_cleanup_sighand;
651 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
652 if (retval)
653 goto bad_fork_cleanup_sighand;
654 p->semundo = NULL;
655
656 /* Our parent execution domain becomes current domain
657 These must match for thread signalling to apply */
658
659 p->parent_exec_id = p->self_exec_id;
the parent-execution-domain ID is set to this process's own execution-domain ID
660
661 /* ok, now we should be set up.. */
662 p->swappable = 1;
this process's pages may now be swapped out
663 p->exit_signal = clone_flags & CSIGNAL;
the signal to send to the parent when this process calls exit()
664 p->pdeath_signal = 0;
the signal the parent is asked to send to this process when the parent exits
665
666 /*
667 * "share" dynamic priority between parent and child, thus the
668 * total amount of dynamic priorities in the system doesnt change,
669 * more scheduling fairness. This is only important in the first
670 * timeslice, on the long run the scheduling behaviour is unchanged.
671 */
672 p->counter = (current->counter + 1) >> 1;
the process's CPU time quantum: the parent's remaining quantum is split in half between parent and child
673 current->counter >>= 1;
674 if (!current->counter)
675 current->need_resched = 1;
676
677 /*
678 * Ok, add it to the run-queues and make it
679 * visible to the rest of the system.
680 *
681 * Let it rip!
682 */
683 retval = p->pid;
684 p->tgid = retval;
685 INIT_LIST_HEAD(&p->thread_group);
initialize the thread-group list
686 write_lock_irq(&tasklist_lock);
687 if (clone_flags & CLONE_THREAD) {
688 p->tgid = current->tgid;
689 list_add(&p->thread_group, &current->thread_group);
690 }
691 SET_LINKS(p);
link into the kernel's process list
692 hash_pid(p);
link into the pid hash table
693 nr_threads++;
694 write_unlock_irq(&tasklist_lock);
695
696 if (p->ptrace & PT_PTRACED)
697 send_sig(SIGSTOP, p, 1);
698
699 wake_up_process(p); /* do this last */
wake up the child process and put it on the run queue
700 ++total_forks;
701
702fork_out:
703 if ((clone_flags & CLONE_VFORK) && (retval > 0))
704 down(&sem);
705 return retval;
706
707bad_fork_cleanup_sighand:
708 exit_sighand(p);
709bad_fork_cleanup_fs:
710 exit_fs(p); /* blocking */
711bad_fork_cleanup_files:
712 exit_files(p); /* blocking */
713bad_fork_cleanup:
714 put_exec_domain(p->exec_domain);
715 if (p->binfmt && p->binfmt->module)
716 __MOD_DEC_USE_COUNT(p->binfmt->module);
717bad_fork_cleanup_count:
718 atomic_dec(&p->user->processes);
719 free_uid(p->user);
720bad_fork_free:
721 free_task_struct(p);
722 goto fork_out;
723}
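Note the down(&sem) on the fork_out path: when CLONE_VFORK is set, the parent sleeps on this semaphore, which the child releases in mm_release() when it exits or calls execve(). That is the whole user-visible contract of vfork(). A minimal sketch of the effect:

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = vfork();
	if (pid == 0) {
		/* the child runs first, borrowing the parent's address space */
		write(1, "child exiting\n", 14);
		_exit(0);        /* releases the parent blocked in down(&sem) */
	}
	/* the parent resumes only after the child has exited or exec'ed */
	printf("parent resumed after child %d\n", pid);
	waitpid(pid, NULL, 0);
	return 0;
}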
The pid for the new process is allocated by get_pid():
87static int get_pid(unsigned long flags)
88{
89 static int next_safe = PID_MAX;
90 struct task_struct *p;
91 int pid, beginpid;
92
93 if (flags & CLONE_PID)
94 return current->pid;
95
96 spin_lock(&lastpid_lock);
97 beginpid = last_pid;
98 if((++last_pid) & 0xffff8000) {
99 last_pid = 300; /* Skip daemons etc. */
100 goto inside;
101 }
102 if(last_pid >= next_safe) {
103inside:
104 next_safe = PID_MAX;
105 read_lock(&tasklist_lock);
106 repeat:
107 for_each_task(p) {
108 if(p->pid == last_pid ||
109 p->pgrp == last_pid ||
110 p->tgid == last_pid ||
111 p->session == last_pid) {
112 if(++last_pid >= next_safe) {
113 if(last_pid & 0xffff8000)
114 last_pid = 300;
115 next_safe = PID_MAX;
116 }
117 if(unlikely(last_pid == beginpid)) {
118 next_safe = 0;
119 goto nomorepids;
120 }
121 goto repeat;
122 }
123 if(p->pid > last_pid && next_safe > p->pid)
124 next_safe = p->pid;
125 if(p->pgrp > last_pid && next_safe > p->pgrp)
126 next_safe = p->pgrp;
127 if(p->tgid > last_pid && next_safe > p->tgid)
128 next_safe = p->tgid;
129 if(p->session > last_pid && next_safe > p->session)
130 next_safe = p->session;
131 }
132 read_unlock(&tasklist_lock);
133 }
134 pid = last_pid;
135 spin_unlock(&lastpid_lock);
136
137 return pid;
138
139nomorepids:
140 read_unlock(&tasklist_lock);
141 spin_unlock(&lastpid_lock);
142 return 0;
143}
Copying the control structures for open files:
436static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
437{
438 struct files_struct *oldf, *newf;
439 struct file **old_fds, **new_fds;
440 int open_files, nfds, size, i, error = 0;
441
442 /*
443 * A background process may not have any files ...
444 */
445 oldf = current->files;
446 if (!oldf)
447 goto out;
448
449 if (clone_flags & CLONE_FILES) {
450 atomic_inc(&oldf->count);
parent and child share this structure, which now has one more user
451 goto out;
452 }
453
454 /*
455 * Note: we may be using current for both targets (See exec.c)
456 * This works because we cache current->files (old) as oldf. Don't
457 * break this.
458 */
459 tsk->files = NULL;
460 error = -ENOMEM;
461 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
allocate a files_struct for the child
462 if (!newf)
463 goto out;
464
465 atomic_set(&newf->count, 1);
466
467 newf->file_lock = RW_LOCK_UNLOCKED;
468 newf->next_fd = 0;
469 newf->max_fds = NR_OPEN_DEFAULT;
470 newf->max_fdset = __FD_SETSIZE;
471 newf->close_on_exec = &newf->close_on_exec_init;
472 newf->open_fds = &newf->open_fds_init;
473 newf->fd = &newf->fd_array[0];
474
475 /* We don't yet have the oldf readlock, but even if the old
476 fdset gets grown now, we'll only copy up to "size" fds */
477 size = oldf->max_fdset;
the capacity of the parent's open-file bitmap
478 if (size > __FD_SETSIZE) {
479 newf->max_fdset = 0;
480 write_lock(&newf->file_lock);
481 error = expand_fdset(newf, size-1);
expand the capacity
482 write_unlock(&newf->file_lock);
483 if (error)
484 goto out_release;
485 }
486 read_lock(&oldf->file_lock);
487
488 open_files = count_open_files(oldf, size);
489
490 /*
491 * Check whether we need to allocate a larger fd array.
492 * Note: we're not a clone task, so the open count won't
493 * change.
494 */
495 nfds = NR_OPEN_DEFAULT;
496 if (open_files > nfds) {
497 read_unlock(&oldf->file_lock);
498 newf->max_fds = 0;
499 write_lock(&newf->file_lock);
500 error = expand_fd_array(newf, open_files-1);
501 write_unlock(&newf->file_lock);
502 if (error)
503 goto out_release;
504 nfds = newf->max_fds;
505 read_lock(&oldf->file_lock);
506 }
507
508 old_fds = oldf->fd;
509 new_fds = newf->fd;
510
511 memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
512 memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
513
514 for (i = open_files; i != 0; i--) {
515 struct file *f = *old_fds++;
516 if (f) {
517 get_file(f);
518 } else {
519 /*
520 * The fd may be claimed in the fd bitmap but not yet
521 * instantiated in the files array if a sibling thread
522 * is partway through open(). So make sure that this
523 * fd is available to the new process.
524 */
525 FD_CLR(open_files - i, newf->open_fds);
526 }
527 *new_fds++ = f;
528 }
529 read_unlock(&oldf->file_lock);
530
531 /* compute the remainder to be cleared */
532 size = (newf->max_fds - open_files) * sizeof(struct file *);
533
534 /* This is long word aligned thus could use a optimized version */
535 memset(new_fds, 0, size);
536
537 if (newf->max_fdset > open_files) {
538 int left = (newf->max_fdset-open_files)/8;
539 int start = open_files / (8 * sizeof(unsigned long));
540
541 memset(&newf->open_fds->fds_bits[start], 0, left);
542 memset(&newf->close_on_exec->fds_bits[start], 0, left);
543 }
544
545 tsk->files = newf;
546 error = 0;
547out:
548 return error;
549
550out_release:
551 free_fdset (newf->close_on_exec, newf->max_fdset);
552 free_fdset (newf->open_fds, newf->max_fdset);
553 kmem_cache_free(files_cachep, newf);
554 goto out;
555}
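The CLONE_FILES branch at the top (just atomic_inc(&oldf->count)) means parent and child operate on one and the same descriptor table. A sketch of the observable difference, assuming glibc's clone() wrapper; closer() and the stack size are made up for the example:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int fd;

/* hypothetical child: closes an entry in the shared descriptor table */
static int closer(void *arg)
{
	close(fd);
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	fd = open("/dev/null", O_WRONLY);

	pid_t pid = clone(closer, stack + 64 * 1024, CLONE_FILES | SIGCHLD, NULL);
	waitpid(pid, NULL, 0);

	/* with CLONE_FILES the child's close() hit our table too, so this
	   write fails with EBADF; after a plain fork() it would succeed */
	if (write(fd, "x", 1) < 0)
		perror("write");
	return 0;
}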
411static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
412{
413 if (clone_flags & CLONE_FS) {
414 atomic_inc(&current->fs->count);
415 return 0;
416 }
417 tsk->fs = __copy_fs_struct(current->fs);
418 if (!tsk->fs)
419 return -1;
420 return 0;
421}
406struct fs_struct *copy_fs_struct(struct fs_struct *old)
407{
408 return __copy_fs_struct(old);
409}
381static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
382{
383 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
384 /* We don't need to lock fs - think why ;-) */
385 if (fs) {
386 atomic_set(&fs->count, 1);
387 fs->lock = RW_LOCK_UNLOCKED;
388 fs->umask = old->umask;
389 read_lock(&old->lock);
390 fs->rootmnt = mntget(old->rootmnt);
391 fs->root = dget(old->root);
392 fs->pwdmnt = mntget(old->pwdmnt);
393 fs->pwd = dget(old->pwd);
394 if (old->altroot) {
395 fs->altrootmnt = mntget(old->altrootmnt);
396 fs->altroot = dget(old->altroot);
397 } else {
398 fs->altrootmnt = NULL;
399 fs->altroot = NULL;
400 }
401 read_unlock(&old->lock);
402 }
403 return fs;
404}
584static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
585{
586 struct signal_struct *sig;
587
588 if (clone_flags & CLONE_SIGHAND) {
589 atomic_inc(&current->sig->count);
590 return 0;
591 }
592 sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
593 tsk->sig = sig;
594 if (!sig)
595 return -1;
596 spin_lock_init(&sig->siglock);
597 atomic_set(&sig->count, 1);
598 memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
599 return 0;
600}
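Sharing sig (the CLONE_SIGHAND branch) means a handler installed by either process is instantly in effect for both. A sketch, again assuming glibc's clone() wrapper; installer() is made up, and CLONE_VM is included because modern kernels require it alongside CLONE_SIGHAND:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile sig_atomic_t seen;

static void on_usr1(int sig) { seen = 1; }

/* hypothetical child: installs a handler in the shared signal table */
static int installer(void *arg)
{
	signal(SIGUSR1, on_usr1);
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);
	pid_t pid = clone(installer, stack + 64 * 1024,
			  CLONE_VM | CLONE_SIGHAND | SIGCHLD, NULL);
	waitpid(pid, NULL, 0);

	raise(SIGUSR1);    /* runs on_usr1 here, in the parent */
	printf("handler installed by the child ran in the parent: %d\n", seen);
	return 0;
}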
318static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
319{
320 struct mm_struct * mm, *oldmm;
321 int retval;
322
323 tsk->min_flt = tsk->maj_flt = 0;
324 tsk->cmin_flt = tsk->cmaj_flt = 0;
325 tsk->nswap = tsk->cnswap = 0;
326
327 tsk->mm = NULL;
328 tsk->active_mm = NULL;
329
330 /*
331 * Are we cloning a kernel thread?
332 *
333 * We need to steal a active VM for that..
334 */
335 oldmm = current->mm;
336 if (!oldmm)
337 return 0;
338
339 if (clone_flags & CLONE_VM) {
340 atomic_inc(&oldmm->mm_users);
341 mm = oldmm;
342 goto good_mm;
343 }
344
345 retval = -ENOMEM;
346 mm = allocate_mm();
347 if (!mm)
348 goto fail_nomem;
349
350 /* Copy the current MM stuff.. */
351 memcpy(mm, oldmm, sizeof(*mm));
352 if (!mm_init(mm))
353 goto fail_nomem;
354
355 if (init_new_context(tsk,mm))
356 goto free_pt;
357
358 down_write(&oldmm->mmap_sem);
359 retval = dup_mmap(mm);
deep-copy the parent's memory map
360 up_write(&oldmm->mmap_sem);
361
362 if (retval)
363 goto free_pt;
364
365 /*
366 * child gets a private LDT (if there was an LDT in the parent)
367 */
368 copy_segments(tsk, mm);
369
370good_mm:
371 tsk->mm = mm;
372 tsk->active_mm = mm;
373 return 0;
374
375free_pt:
376 mmput(mm);
377fail_nomem:
378 return retval;
379}
145static inline int dup_mmap(struct mm_struct * mm)
146{
147 struct vm_area_struct * mpnt, *tmp, **pprev;
148 int retval;
149
150 flush_cache_mm(current->mm);
151 mm->locked_vm = 0;
152 mm->mmap = NULL;
153 mm->mmap_cache = NULL;
154 mm->map_count = 0;
155 mm->rss = 0;
156 mm->cpu_vm_mask = 0;
157 mm->swap_address = 0;
158 pprev = &mm->mmap;
159
160 /*
161 * Add it to the mmlist after the parent.
162 * Doing it this way means that we can order the list,
163 * and fork() won't mess up the ordering significantly.
164 * Add it first so that swapoff can see any swap entries.
165 */
166 spin_lock(&mmlist_lock);
167 list_add(&mm->mmlist, &current->mm->mmlist);
168 mmlist_nr++;
169 spin_unlock(&mmlist_lock);
170
171 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
172 struct file *file;
173
174 retval = -ENOMEM;
175 if(mpnt->vm_flags & VM_DONTCOPY)
176 continue;
177 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
178 if (!tmp)
179 goto fail_nomem;
180 *tmp = *mpnt;
181 tmp->vm_flags &= ~VM_LOCKED;
182 tmp->vm_mm = mm;
183 tmp->vm_next = NULL;
184 file = tmp->vm_file;
185 if (file) {
186 struct inode *inode = file->f_dentry->d_inode;
187 get_file(file);
188 if (tmp->vm_flags & VM_DENYWRITE)
189 atomic_dec(&inode->i_writecount);
190
191 /* insert tmp into the share list, just after mpnt */
192 spin_lock(&inode->i_mapping->i_shared_lock);
193 if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
194 mpnt->vm_next_share->vm_pprev_share =
195 &tmp->vm_next_share;
196 mpnt->vm_next_share = tmp;
197 tmp->vm_pprev_share = &mpnt->vm_next_share;
198 spin_unlock(&inode->i_mapping->i_shared_lock);
199 }
200
201 /*
202 * Link in the new vma and copy the page table entries:
203 * link in first so that swapoff can see swap entries.
204 */
205 spin_lock(&mm->page_table_lock);
206 *pprev = tmp;
207 pprev = &tmp->vm_next;
208 mm->map_count++;
209 retval = copy_page_range(mm, current->mm, tmp);
210 spin_unlock(&mm->page_table_lock);
211
212 if (tmp->vm_ops && tmp->vm_ops->open)
213 tmp->vm_ops->open(tmp);
214
215 if (retval)
216 goto fail_nomem;
217 }
218 retval = 0;
219 build_mmap_rb(mm);
220
221fail_nomem:
222 flush_tlb_mm(current->mm);
223 return retval;
224}
144/*
145 * copy one vm_area from one task to the other. Assumes the page tables
146 * already present in the new task to be cleared in the whole range
147 * covered by this vma.
148 *
149 * 08Jan98 Merged into one routine from several inline routines to reduce
150 * variable count and make things faster. -jj
151 */
152int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
153 struct vm_area_struct *vma)
154{
155 pgd_t * src_pgd, * dst_pgd;
156 unsigned long address = vma->vm_start;
157 unsigned long end = vma->vm_end;
158 unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
159
160 src_pgd = pgd_offset(src, address)-1;
161 dst_pgd = pgd_offset(dst, address)-1;
162
163 for (;;) {
loop over the page-directory entries
164 pmd_t * src_pmd, * dst_pmd;
165
166 src_pgd++; dst_pgd++;
167
168 /* copy_pmd_range */
169
170 if (pgd_none(*src_pgd))
171 goto skip_copy_pmd_range;
172 if (pgd_bad(*src_pgd)) {
173 pgd_ERROR(*src_pgd);
174 pgd_clear(src_pgd);
175skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
176 if (!address || (address >= end))
177 goto out;
178 continue;
179 }
180 if (pgd_none(*dst_pgd)) {
181 if (!pmd_alloc(dst_pgd, 0))
182 goto nomem;
183 }
184
185 src_pmd = pmd_offset(src_pgd, address);
186 dst_pmd = pmd_offset(dst_pgd, address);
187
188 do {
loop over the middle-directory entries
189 pte_t * src_pte, * dst_pte;
190
191 /* copy_pte_range */
192
193 if (pmd_none(*src_pmd))
194 goto skip_copy_pte_range;
195 if (pmd_bad(*src_pmd)) {
196 pmd_ERROR(*src_pmd);
197 pmd_clear(src_pmd);
198skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK;
199 if (address >= end)
200 goto out;
201 goto cont_copy_pmd_range;
202 }
203 if (pmd_none(*dst_pmd)) {
204 if (!pte_alloc(dst_pmd, 0))
205 goto nomem;
206 }
207
208 src_pte = pte_offset(src_pmd, address);
209 dst_pte = pte_offset(dst_pmd, address);
210
211 do {
loop over the page-table entries
212 pte_t pte = *src_pte;
213 struct page *ptepage;
214
215 /* copy_one_pte */
216
217 if (pte_none(pte))
no mapping has been established for this page
218 goto cont_copy_pte_range_noset;
219 if (!pte_present(pte)) {
the page contents are out on the swap device
220 swap_duplicate(pte_to_swp_entry(pte));
increment the swap entry's share count
221 goto cont_copy_pte_range;
222 }
223 ptepage = pte_page(pte);
224 if ((!VALID_PAGE(ptepage)) ||
225 PageReserved(ptepage))
handling of invalid pages (device memory, or pages reserved by the kernel)
226 goto cont_copy_pte_range;
227
228 /* If it's a COW mapping, write protect it both in the parent and the child */
229 if (cow) {
230 ptep_set_wrprotect(src_pte);
make the parent's page-table entry write-protected as well
231 pte = *src_pte;
232 }
233
234 /* If it's a shared mapping, mark it clean in the child */
235 if (vma->vm_flags & VM_SHARED)
236 pte = pte_mkclean(pte);
the (now write-protected) entry is then installed in the child's page table
237 pte = pte_mkold(pte);
238 get_page(ptepage);
239
240cont_copy_pte_range: set_pte(dst_pte, pte);
241cont_copy_pte_range_noset: address += PAGE_SIZE;
242 if (address >= end)
243 goto out;
244 src_pte++;
245 dst_pte++;
246 } while ((unsigned long)src_pte & PTE_TABLE_MASK);
247
248cont_copy_pmd_range: src_pmd++;
249 dst_pmd++;
250 } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
251 }
252out:
253 return 0;
254
255nomem:
256 return -ENOMEM;
257}
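The ptep_set_wrprotect() above is the copy-on-write trap being armed: both processes keep mapping the same physical pages read-only, and the first write by either side faults and gets a private copy. The user-visible result after a plain fork():

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static int value = 1;

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		value = 2;     /* write fault: the kernel copies the page */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	/* the parent's copy was never touched; this prints 1 */
	printf("parent still sees value = %d\n", value);
	return 0;
}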
499/*
500 * we do not have to muck with descriptors here, that is
501 * done in switch_mm() as needed.
502 */
503void copy_segments(struct task_struct *p, struct mm_struct *new_mm)
504{
505 struct mm_struct * old_mm;
506 void *old_ldt, *ldt;
507
508 ldt = NULL;
509 old_mm = current->mm;
510 if (old_mm && (old_ldt = old_mm->context.segments) != NULL) {
511 /*
512 * Completely new LDT, we initialize it from the parent:
513 */
514 ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE);
515 if (!ldt)
516 printk(KERN_WARNING "ldt allocation failed\n");
517 else
518 memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE);
519 }
520 new_mm->context.segments = ldt;
521}
529int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
530 unsigned long unused,
531 struct task_struct * p, struct pt_regs * regs)
532{
533 struct pt_regs * childregs;
534
535 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
obtain a pointer to the pt_regs structure on the child's kernel-space stack
536 struct_cpy(childregs, regs);
537 childregs->eax = 0;
set the child's return value to 0
538 childregs->esp = esp;
set the child's user-space stack pointer
539
540 p->thread.esp = (unsigned long) childregs;
record the start of the pt_regs structure as the child's saved kernel stack pointer
541 p->thread.esp0 = (unsigned long) (childregs+1);
record the top of the child's kernel-space stack
542
543 p->thread.eip = (unsigned long) ret_from_fork;
the entry point for the next time this process is switched in; set to ret_from_fork so that the child starts there when first scheduled
544
545 savesegment(fs,p->thread.fs);
save the current value of segment register fs in p->thread.fs
546 savesegment(gs,p->thread.gs);
547
548 unlazy_fpu(current);
549 struct_cpy(&p->thread.i387, &current->thread.i387);
550
551 return 0;
552}
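Because copy_thread() stores 0 into childregs->eax and points thread.eip at ret_from_fork, the one call to fork() appears to return twice: the parent receives retval (the child's pid) from do_fork(), while the child, resuming through the copied register frame, receives 0. This is precisely the convention user code tests:

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		/* child: eax in the copied pt_regs was set to 0 */
		printf("fork() returned 0 in the child\n");
		_exit(0);
	}
	/* parent: do_fork() returned retval == p->pid */
	printf("fork() returned %d in the parent\n", pid);
	waitpid(pid, NULL, 0);
	return 0;
}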
This article has walked through the process-duplication machinery of the Linux kernel, in particular the implementation of the fork(), vfork() and __clone() system calls: how each call sets clone_flags to decide which resources are copied and which are shared, covering open file descriptors, filesystem information, signal handlers and the memory map.