linux内核笔记二进程管理-优快云博客

本文详细解读Linux系统进程的运转方式，涉及jiffies系统滴答、进程创建过程、进程调度算法、定时器中断、进程通信与退出机制，重点剖析了创建新进程、时间片分配和调度策略。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

三、系统的进程管理

1、系统的进程运转方式

系统时间（jiffies 系统滴答）

机器上有一个RTC（实时时钟，时间保存在CMOS中）。内核在开机初始化时读出当前时间，再调用kernel_mktime函数算出从1970年1月1日0时到开机时刻所经过的秒数。

mktime.c

/*
 *  linux/kernel/mktime.c
 *
 *  (C) 1991  Linus Torvalds
 */

#include <time.h>

/*
 * This isn't the library routine, it is only used in the kernel.
 * as such, we don't care about years<1970 etc, but assume everything
 * is ok. Similarly, TZ etc is happily ignored. We just do everything
 * as easily as possible. Let's find something public for the library
 * routines (although I think minix times is public).
 */
/*
 * PS. I hate whoever though up the year 1970 - couldn't they have gotten
 * a leap-year instead? I also hate Gregorius, pope or no. I'm grumpy.
 */
#define MINUTE 60
#define HOUR (60*MINUTE)
#define DAY (24*HOUR)
#define YEAR (365*DAY)

/*
 * Seconds from 1 January to the first day of each month, computed as if
 * every year were a leap year (February counted as 29 days).
 * kernel_mktime() takes one day back off for non-leap years.
 */
static const int month[12] = {
	0,
	DAY*(31),
	DAY*(31+29),
	DAY*(31+29+31),
	DAY*(31+29+31+30),
	DAY*(31+29+31+30+31),
	DAY*(31+29+31+30+31+30),
	DAY*(31+29+31+30+31+30+31),
	DAY*(31+29+31+30+31+30+31+31),
	DAY*(31+29+31+30+31+30+31+31+30),
	DAY*(31+29+31+30+31+30+31+31+30+31),
	DAY*(31+29+31+30+31+30+31+31+30+31+30)
};

/*
 * kernel_mktime - convert a broken-down time to seconds since
 * 1970-01-01 00:00:00.
 * @tm: time to convert; tm_year, tm_mon (0..11), tm_mday (1-based),
 *      tm_hour, tm_min and tm_sec are read, nothing is written.
 *
 * No time zone handling. Leap years are found with a plain "every
 * fourth year" rule, so results are only right for 1970..2099.
 *
 * A tm_year below 70 is treated as a two-digit year of the 2000s
 * (0 -> 2000) instead of producing a negative result.
 * NOTE(review): this assumes the caller may pass a two-digit year, as a
 * raw RTC/CMOS read presumably would - confirm against the caller.
 *
 * Returns the number of seconds as a long.
 */
long kernel_mktime(struct tm * tm)
{
	long res;
	int year;

	year = tm->tm_year - 70;
	if (year < 0)
		year += 100;
/* magic offsets (y+1) needed to get leapyears right.*/
	/* widen before multiplying so the product is not computed in int */
	res = (long)YEAR*year + (long)DAY*((year+1)/4);
	res += month[tm->tm_mon];
/* and (y+2) here. If it wasn't a leap-year, we have to adjust */
	if (tm->tm_mon>1 && ((year+2)%4))
		res -= DAY;
	res += (long)DAY*(tm->tm_mday-1);
	res += HOUR*tm->tm_hour;
	res += MINUTE*tm->tm_min;
	res += tm->tm_sec;
	return res;
}

首先是四个宏定义（MINUTE、HOUR、DAY、YEAR），然后是kernel_mktime函数。它接收一个tm参数，这个参数就是一个时间结构体，里面的值是初始化时从RTC(CMOS)读出来的。kernel_mktime把它转化为秒数，存入全局变量中，并且为jiffies所用。

jiffies是系统滴答计数：每隔10ms会引发一次定时器中断。
中断服务函数中首先进行了jiffies的自加。这个中断入口叫timer_interrupt，在system_call.s中。
首先是寄存器的压栈保存，然后修改一些段寄存器的值，然后自加jiffies，最后call _do_timer。

# Timer interrupt entry point.
# Saves the segment and scratch registers it uses, loads selector 0x10
# into %ds/%es and 0x17 into %fs, increments the global tick counter
# _jiffies, acknowledges the interrupt controller, and then calls
# do_timer() with the privilege level of the interrupted code.
# Leaves through ret_from_sys_call.
_timer_interrupt:
	push %ds		# save ds,es and put kernel data space
	push %es		# into them. %fs is used by _system_call
	push %fs
	pushl %edx		# we save %eax,%ecx,%edx as gcc doesn't
	pushl %ecx		# save those across function calls. %ebx
	pushl %ebx		# is saved as we use that in ret_sys_call
	pushl %eax
	movl $0x10,%eax		# selector 0x10 -> %ds and %es
	mov %ax,%ds
	mov %ax,%es
	movl $0x17,%eax		# selector 0x17 -> %fs
	mov %ax,%fs
	incl _jiffies		# one more tick
	movb $0x20,%al		# EOI to interrupt controller #1
	outb %al,$0x20
	movl CS(%esp),%eax	# saved %cs of the interrupted code
	andl $3,%eax		# %eax is CPL (0 or 3, 0=supervisor)
	pushl %eax		# argument for do_timer
	call _do_timer		# 'do_timer(long CPL)' does everything from
	addl $4,%esp		# task switching to accounting ...
	jmp ret_from_sys_call

上面的最后调用了 _do_timer
这个函数在sched.c

/*
 * do_timer - per-tick bookkeeping, called from the timer interrupt.
 * @cpl: privilege level of the interrupted code; 0 means kernel mode,
 *       non-zero means user mode.
 *
 * Counts the beep down, charges the tick to the current task, runs
 * expired entries of the next_timer list, services the floppy timer
 * and calls schedule() once the current task's time slice is used up.
 */
void do_timer(long cpl)
{
	extern int beepcount;
	extern void sysbeepstop(void);

	/* stop the beep when its countdown reaches zero */
	if (beepcount)
		if (!--beepcount)
			sysbeepstop();

	/* account this tick as user time or kernel time */
	if (cpl)
		current->utime++;
	else
		current->stime++;

	/*
	 * Only the head entry is decremented per tick. Every entry whose
	 * count is zero or below is unlinked first and then its callback
	 * is run.
	 */
	if (next_timer) {
		next_timer->jiffies--;
		while (next_timer && next_timer->jiffies <= 0) {
			void (*fn)(void);
			
			fn = next_timer->fn;
			next_timer->fn = NULL;
			next_timer = next_timer->next;
			(fn)();
		}
	}
	/* NOTE(review): presumably the upper DOR bits mean a floppy motor is on - confirm */
	if (current_DOR & 0xf0)
		do_floppy_timer();
	/* time slice not used up yet: keep running the current task */
	if ((--current->counter)>0) return;
	current->counter=0;
	/* the tick interrupted kernel-mode code: do not reschedule here */
	if (!cpl) return;
	schedule();
}

if (cpl)
current->utime++;
else
current->stime++;

cpl表示当前被中断的程序的特权级。0表示内核态，3（非0）表示用户态。
进程都是和task_struct对象对应的，比如一个进程的创建，就是建立一个这样的对象。
current是指向当前进程task_struct的指针。
utime：用户态的运行时间。
stime：内核态的运行时间。

if (next_timer) {
next_timer->jiffies--;
while (next_timer && next_timer->jiffies <= 0) {
void (*fn)(void);
fn = next_timer->fn;
next_timer->fn = NULL;
next_timer = next_timer->next;
(fn)();
}
}

next_timer是一个定时器链表的头指针，链表里是所有挂接在jiffies这个滴答上的定时器。
每次滴答都查看定时器链表，如果表头定时器的剩余时间减到0（或更小），就调用它登记的处理函数。

current->counter：进程的时间片。
标志着当前进程还能运行多长时间。
task[]是进程指针数组，每个task_struct里的counter就是时间片。
counter在哪里用：进程的调度就是对task[]进程数组的检索，找出时间片最大的那个就绪进程，然后切换过去运行，直到它的时间片减为0，之后再进行新一轮的调度。

counter在哪里设置：如果所有就绪进程的counter都为0，也就是时间片都用完了，就进行新一轮的counter分配。（按优先级分配）

(*p)->counter = ((*p)->counter >> 1) +(*p)->priority

我们0.1.1 优先级时间片轮转调度算法。

2、如何进行创建一个新的进程

进程封装成了一个结构体。

/*
 * Per-task descriptor: scheduling state, signal state, identity,
 * accounting, file-system context and the task's LDT and TSS.
 */
struct task_struct {
/* these are hardcoded - don't touch */
	long state;	/* -1 unrunnable, 0 runnable, >0 stopped (task state) */
	long counter; /* remaining time slice, decremented on every timer tick */
	long priority; /* priority; used to refill counter */
	long signal;	/* bitmap of pending signals (bit sig-1 per signal) */
	struct sigaction sigaction[32];
	long blocked;	/* bitmap of masked signals */
/* various fields */
	int exit_code;	/* exit status stored when the task exits */
	unsigned long start_code,end_code,end_data,brk,start_stack;
	long pid,father,pgrp,session,leader;	/* father holds the parent's pid */
	unsigned short uid,euid,suid;
	unsigned short gid,egid,sgid;
	long alarm;	/* jiffies value after which SIGALRM is raised; 0 = none */
	/* ticks in user / kernel mode, the same sums for reaped children, fork time */
	long utime,stime,cutime,cstime,start_time;
	unsigned short used_math;
/* file system info */
	int tty;		/* -1 if no tty, so it must be signed */
	unsigned short umask;
	struct m_inode * pwd;
	struct m_inode * root;
	struct m_inode * executable;
	unsigned long close_on_exec;
	struct file * filp[NR_OPEN];	/* open files, indexed by descriptor */
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
	struct desc_struct ldt[3];
/* tss for this task */
	struct tss_struct tss;
};

state; -1 unrunnable, 0 runnable, >0 stopped 进程的状态
counter; 时间片的计数值
priority 优先级
counter = counter/2+priority

有一个进程链表，task。检索链表，查看counter。

每个进程都有LDT局部描述符，还有一个TSS进程的状态描述符。
LDT里面会有进程的数据段，跟进程的代码段。
TSS 在进程运行的过程,cpu需要知道的进程的状态标识。

分时技术进行多进程调度。

重点：进程的创建是如何的？

linux在初始化的过程中会进行0号进程的创建。
main函数。

/*
 * Kernel entry point after start-up: sizes memory, initialises each
 * subsystem in turn, enables interrupts, drops to user mode and forks
 * the task that runs init(). The original task stays in the pause loop.
 */
void main(void)		/* This really IS void, no error here. */
{			/* The startup routine assumes (well, ...) this */
/*
 * Interrupts are still disabled. Do necessary setups, then
 * enable them
 */
 	ROOT_DEV = ORIG_ROOT_DEV;
 	drive_info = DRIVE_INFO;
	/* memory end = 1MB plus the extended memory size, rounded down to a page */
	memory_end = (1<<20) + (EXT_MEM_K<<10);
	memory_end &= 0xfffff000;
	/* at most 16MB is used */
	if (memory_end > 16*1024*1024)
		memory_end = 16*1024*1024;
	/* the buffer area grows with the amount of memory: 4MB, 2MB or 1MB */
	if (memory_end > 12*1024*1024) 
		buffer_memory_end = 4*1024*1024;
	else if (memory_end > 6*1024*1024)
		buffer_memory_end = 2*1024*1024;
	else
		buffer_memory_end = 1*1024*1024;
	main_memory_start = buffer_memory_end;
#ifdef RAMDISK
	main_memory_start += rd_init(main_memory_start, RAMDISK*1024);
#endif
	// initialise main-memory management
	mem_init(main_memory_start,memory_end);
	// initialise the exception (trap) handlers
	trap_init();
	// initialise the block device drivers
	blk_dev_init();
	// initialise the character device drivers
	chr_dev_init();
	// initialise the console (tty) devices
	tty_init();
	// initialise the system time
	time_init();
	// initialise the scheduler
	sched_init();
	// initialise the buffer area
	buffer_init(buffer_memory_end);
	// initialise the hard disk driver
	hd_init();
	// initialise the floppy driver
	floppy_init();
	sti();
	// switch from kernel initialisation to user mode
	move_to_user_mode();
	// fork a child task; the child (fork() == 0) runs init(), the parent is task 0
	if (!fork()) {		/* we count on this going ok */
		init();
	}
/*
 *   NOTE!!   For any other task 'pause()' would mean we have to get a
 * signal to awaken, but task0 is the sole exception (see 'schedule()')
 * as task 0 gets activated at every idle moment (when no other tasks
 * can run). For task0 'pause()' just means we go check if some other
 * task can run, and if not we return here.
 */
	for(;;) pause();
}

刚开始一些东西是初始化linux，然后就是初始化一些什么异常初始化，驱动初始化，时间初始化等等。
我们现在要分析的是进程调度初始化。

/*
 * sched_init - set up the scheduler.
 *
 * Installs the TSS and LDT descriptors of init_task in the GDT, clears
 * every other task slot together with its pair of GDT descriptors,
 * programs the timer chip, and installs the timer interrupt gate
 * (vector 0x20) and the system call gate (vector 0x80).
 */
void sched_init(void)
{
	int i;
	struct desc_struct * p;

	/* sanity check on the layout of struct sigaction */
	if (sizeof(struct sigaction) != 16)
		panic("Struct sigaction MUST be 16 bytes");
	set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
	set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
	/* start at the descriptor pair that follows init_task's */
	p = gdt+2+FIRST_TSS_ENTRY;
	/* empty task slots 1..NR_TASKS-1 and zero two descriptors for each */
	for(i=1;i<NR_TASKS;i++) {
		task[i] = NULL;
		p->a=p->b=0;
		p++;
		p->a=p->b=0;
		p++;
	}
/* Clear NT, so that we won't have troubles with that later on */
	__asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
	/* NOTE(review): presumably loads the task register and LDT register for task 0 - confirm */
	ltr(0);
	lldt(0);
	outb_p(0x36,0x43);		/* binary, mode 3, LSB/MSB, ch 0 */
	outb_p(LATCH & 0xff , 0x40);	/* LSB */
	outb(LATCH >> 8 , 0x40);	/* MSB */
	/* timer_interrupt handles interrupt vector 0x20 */
	set_intr_gate(0x20,&timer_interrupt);
	/* clear bit 0 of the mask read from port 0x21 (presumably unmasks the timer IRQ - confirm) */
	outb(inb_p(0x21)&~0x01,0x21);
	/* system_call handles vector 0x80 */
	set_system_gate(0x80,&system_call);
}

系统级别GDT 描述符。
代码先是复制信息到GDT，然后把task链表清空。
task链表就是进程链表。
然后进入汇编代码阶段，都是在初始化一些寄存器。
最后会发现有个这玩意

set_system_gate(0x80,&system_call);

所有人都能用的一个中断，那么这个中断是一个系统调用。

linux在初始化的过程中会进行0号进程的创建。
在move_to_user_mode()这个函数，下面fork已经是在创建1号进程了。
对于0号进程的软件抽象实体已经通过静态的方式定义好了，那如何让这个实体运行呢？回想一下，进程的软件抽象是由描述进程相关的成员及执行时的栈空间组成，对于进程描述符描述的是进程的相关静态属性，而栈空间是进程执行时动态空间。因为在进程0初次执行时，对于进程的调度程序其实还没初始化，所以进程0的首次运行肯定不是通过内核中的调度程序加载的，而是简单的通过将栈寄存器(esp)指向进程0的栈底位置，从而表示了在接下来的程序执行是在进程0的栈空间中，也就是进程0在执行了。

fork下面做了个init工作。
init是干了点啥。

/*
 * init - body of the task forked by main().
 *
 * Opens /dev/tty0 and duplicates the descriptor twice, prints buffer
 * and memory figures, runs /bin/sh once with /etc/rc as its input,
 * and then loops forever starting a new /bin/sh each time the
 * previous one exits.
 */
void init(void)
{
	int pid,i;

	setup((void *) &drive_info);
	(void) open("/dev/tty0",O_RDWR,0);
	(void) dup(0);
	(void) dup(0);
	// three descriptors now refer to /dev/tty0: standard input, output and error
	printf("%d buffers = %d bytes buffer space\n\r",NR_BUFFERS,
		NR_BUFFERS*BLOCK_SIZE);
	printf("Free mem: %d bytes\n\r",memory_end-main_memory_start);
	if (!(pid=fork())) { // child: reopen descriptor 0 on /etc/rc and exec the shell
		close(0);
		if (open("/etc/rc",O_RDONLY,0))
			_exit(1);
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);
	}
	/* parent: wait until that particular child has exited */
	if (pid>0)
		while (pid != wait(&i))
			/* nothing */;
	/* keep a shell running: start one, wait for it, report, repeat */
	while (1) {
		if ((pid=fork())<0) {
			printf("Fork failed in init\r\n");
			continue;
		}
		if (!pid) {
			/* child: new session with fresh descriptors on /dev/tty0 */
			close(0);close(1);close(2);
			setsid();
			(void) open("/dev/tty0",O_RDWR,0);
			(void) dup(0);
			(void) dup(0);
			_exit(execve("/bin/sh",argv,envp));
		}
		while (1)
			if (pid == wait(&i))
				break;
		printf("\n\rchild %d died with code %04x\n\r",pid,i);
		sync();
	}
	_exit(0);	/* NOTE! _exit, not exit() */
}

进程初始化：
1号进程（init）
1、main中fork出来的子进程执行init()：设置了一些驱动信息，打开了一个tty0文件，作为标准输入控制台。
又复制出了标准输出控制台跟标准错误控制台。
2、再fork一个子进程，如果创建成功，则在这个子进程中关闭标准输入并打开文件/etc/rc，然后执行shell程序。etc在linux里面存放配置文件，shell会读取rc这个文件，然后执行这个里面所有的命令。
3、下面的while(1)循环不断fork子进程来运行/bin/sh，并等待它结束；shell一退出就打印它的退出码、执行sync，然后再启动一个新的shell。
4、0号进程不可能结束，他会在没有其他进程可以运行的时候被调度，只会执行for(;;) pause();

进程的创建

fork
1、在task链表中找一个进程空位存档当前的进程
2、创建一个task_struct
3、设置task_struct

fork.c

/*
 *  linux/kernel/fork.c
 *
 *  (C) 1991  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also system_call.s), and some misc functions ('verify_area').
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */
#include <errno.h>

#include <linux/sched.h>
#include <linux/kernel.h>
#include <asm/segment.h>
#include <asm/system.h>

extern void write_verify(unsigned long address);

long last_pid=0;

/*
 * verify_area - run write_verify() on every page touched by a range.
 * @addr: start of the range, as an offset in the current task's data
 *        segment (the base of ldt[2] is added here).
 * @size: length of the range in bytes.
 *
 * The range is widened down to the start of its first page and then
 * walked in 4096-byte steps.
 */
void verify_area(void * addr,int size)
{
	unsigned long page = (unsigned long) addr;
	int remaining = size + (page & 0xfff);

	page &= 0xfffff000;
	page += get_base(current->ldt[2]);
	for (; remaining > 0; remaining -= 4096, page += 4096)
		write_verify(page);
}

/*
 * copy_mem - set up the address range of a new task and copy the
 * current task's page tables into it.
 * @nr: task slot of the new task; its range starts at nr * 0x4000000.
 * @p:  the new task; start_code and the bases of ldt[1] and ldt[2]
 *      are written.
 *
 * Panics when the current task's code and data bases differ or when
 * its data limit is smaller than its code limit.
 *
 * Returns 0 on success, -ENOMEM when copy_page_tables() fails (the
 * partially built range is freed first).
 */
int copy_mem(int nr,struct task_struct * p)
{
	unsigned long src_code, src_data, dst_base;
	unsigned long code_len, data_len;

	code_len = get_limit(0x0f);
	data_len = get_limit(0x17);
	src_code = get_base(current->ldt[1]);
	src_data = get_base(current->ldt[2]);
	if (src_data != src_code)
		panic("We don't support separate I&D");
	if (data_len < code_len)
		panic("Bad data_limit");

	/* code and data of the new task share one base */
	dst_base = nr * 0x4000000;
	p->start_code = dst_base;
	set_base(p->ldt[1],dst_base);
	set_base(p->ldt[2],dst_base);

	if (!copy_page_tables(src_data,dst_base,data_len))
		return 0;
	free_page_tables(dst_base,data_len);
	return -ENOMEM;
}

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in it's entirety.
 */
/*
 * copy_process - build the child task for fork().
 * @nr: free task[] slot chosen by find_empty_process().
 * The remaining parameters are register values found on the stack;
 * they are copied into the child's TSS.
 *
 * Returns the child's pid (last_pid) on success, -EAGAIN when no page
 * is available or when copy_mem() fails.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
		long ebx,long ecx,long edx,
		long fs,long es,long ds,
		long eip,long cs,long eflags,long esp,long ss)
{
	struct task_struct *p;
	int i;
	struct file *f;

	/* one page holds the task structure */
	p = (struct task_struct *) get_free_page();
	if (!p)
		return -EAGAIN;
	task[nr] = p;
	*p = *current;	/* NOTE! this doesn't copy the supervisor stack */
	/* keep the child off the run queue until it is fully set up */
	p->state = TASK_UNINTERRUPTIBLE;
	p->pid = last_pid;
	p->father = current->pid;
	/* the first time slice equals the priority */
	p->counter = p->priority;
	p->signal = 0;
	p->alarm = 0;
	p->leader = 0;		/* process leadership doesn't inherit */
	p->utime = p->stime = 0;
	p->cutime = p->cstime = 0;
	p->start_time = jiffies;
	p->tss.back_link = 0;
	/* ring-0 stack pointer: the end of the page that holds *p */
	p->tss.esp0 = PAGE_SIZE + (long) p;
	p->tss.ss0 = 0x10;
	p->tss.eip = eip;
	p->tss.eflags = eflags;
	/* the child resumes with eax == 0 */
	p->tss.eax = 0;
	p->tss.ecx = ecx;
	p->tss.edx = edx;
	p->tss.ebx = ebx;
	p->tss.esp = esp;
	p->tss.ebp = ebp;
	p->tss.esi = esi;
	p->tss.edi = edi;
	/* segment registers: only the low 16 bits are kept */
	p->tss.es = es & 0xffff;
	p->tss.cs = cs & 0xffff;
	p->tss.ss = ss & 0xffff;
	p->tss.ds = ds & 0xffff;
	p->tss.fs = fs & 0xffff;
	p->tss.gs = gs & 0xffff;
	p->tss.ldt = _LDT(nr);
	p->tss.trace_bitmap = 0x80000000;
	/* the parent was the last user of the math unit: save that state into the child */
	if (last_task_used_math == current)
		__asm__("clts ; fnsave %0"::"m" (p->tss.i387));
	/* on failure undo the slot assignment and give the page back */
	if (copy_mem(nr,p)) {
		task[nr] = NULL;
		free_page((long) p);
		return -EAGAIN;
	}
	/* every file inherited by the child gains one reference */
	for (i=0; i<NR_OPEN;i++)
		if (f=p->filp[i])
			f->f_count++;
	/* so do the inodes of cwd, root and the executable */
	if (current->pwd)
		current->pwd->i_count++;
	if (current->root)
		current->root->i_count++;
	if (current->executable)
		current->executable->i_count++;
	/* each task slot owns two consecutive GDT entries */
	set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
	p->state = TASK_RUNNING;	/* do this last, just in case */
	return last_pid;
}

/*
 * find_empty_process - choose a pid and a free task slot for fork().
 *
 * Advances last_pid (wrapping to 1 when it turns negative) until no
 * existing task uses that pid, then looks for an empty slot in
 * task[1..NR_TASKS-1].
 *
 * Returns the slot index, or -EAGAIN when every slot is taken.
 * last_pid is left at the chosen pid.
 */
int find_empty_process(void)
{
	int slot;
	int pid_in_use;

	do {
		if ((++last_pid) < 0)
			last_pid = 1;
		pid_in_use = 0;
		for (slot = 0; slot < NR_TASKS; slot++) {
			if (task[slot] && task[slot]->pid == last_pid) {
				pid_in_use = 1;
				break;
			}
		}
	} while (pid_in_use);

	slot = 1;
	while (slot < NR_TASKS) {
		if (!task[slot])
			return slot;
		slot++;
	}
	return -EAGAIN;
}

进程的创建是系统调用。
进程的创建就是对0号进程或者当前进程的复制。（0号进程复制就是结构体的复制 task0对应的task_struct赋值给新建的task_struct，对于栈堆的拷贝就是当进程做创建的时候要复制原有的栈堆）

1、给当前要创建的一个进程分配一个进程号

验证区域
拷贝内存
拷贝程序
找到一个空程序。

我们找到它的系统调用

# sys_fork: find a free task slot, then let copy_process() build the
# child. A negative result of find_empty_process is returned unchanged.
.align 2
_sys_fork:
	call _find_empty_process
	testl %eax,%eax		# negative result means no free slot
	js 1f
	push %gs		# arguments for copy_process, pushed last-first:
	pushl %esi		# gs, esi, edi, ebp and
	pushl %edi
	pushl %ebp
	pushl %eax		# nr = the slot index just found
	call _copy_process
	addl $20,%esp		# drop the five values pushed above
1:	ret

会干嘛会首先call 哪个find empty。

/*
 * find_empty_process - pick an unused pid and a free task slot.
 *
 * last_pid is incremented (restarting at 1 if it becomes negative)
 * until no task in task[] carries it. The first empty slot with an
 * index of 1 or above is then returned, or -EAGAIN if there is none.
 */
int find_empty_process(void)
{
	int idx;

	for (;;) {
		if (++last_pid < 0)
			last_pid = 1;
		for (idx = 0; idx < NR_TASKS; idx++)
			if (task[idx] && task[idx]->pid == last_pid)
				break;
		if (idx == NR_TASKS)
			break;	/* nobody owns last_pid */
	}

	for (idx = 1; idx < NR_TASKS; idx++) {
		if (task[idx])
			continue;
		return idx;
	}
	return -EAGAIN;
}

先进来检索，然后找到一个task为空的地方。

2、进程的创建主体

/*
 * copy_process - create the child task of a fork.
 * @nr: index of the free task[] slot the child will occupy.
 * All other parameters are saved register values; they end up in the
 * child's TSS.
 *
 * Returns last_pid (the child's pid), or -EAGAIN if a page cannot be
 * obtained or copy_mem() fails.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
		long ebx,long ecx,long edx,
		long fs,long es,long ds,
		long eip,long cs,long eflags,long esp,long ss)
{
	struct task_struct *p;
	int i;
	struct file *f;

	/* step 1: get a page for the new task structure and register it */
	p = (struct task_struct *) get_free_page();
	if (!p)
		return -EAGAIN;
	task[nr] = p;
	/* step 2: start from a copy of the parent, then fix up the differences */
	*p = *current;	/* NOTE! this doesn't copy the supervisor stack */
	p->state = TASK_UNINTERRUPTIBLE;	/* not runnable while being built */
	p->pid = last_pid;
	p->father = current->pid;
	p->counter = p->priority;
	p->signal = 0;
	p->alarm = 0;
	p->leader = 0;		/* process leadership doesn't inherit */
	p->utime = p->stime = 0;
	p->cutime = p->cstime = 0;
	p->start_time = jiffies;
	/* step 3: fill the TSS from the saved registers */
	p->tss.back_link = 0;
	p->tss.esp0 = PAGE_SIZE + (long) p;	/* top of the page holding *p */
	p->tss.ss0 = 0x10;
	p->tss.eip = eip;
	p->tss.eflags = eflags;
	p->tss.eax = 0;	/* eax is 0 when the child resumes */
	p->tss.ecx = ecx;
	p->tss.edx = edx;
	p->tss.ebx = ebx;
	p->tss.esp = esp;
	p->tss.ebp = ebp;
	p->tss.esi = esi;
	p->tss.edi = edi;
	p->tss.es = es & 0xffff;
	p->tss.cs = cs & 0xffff;
	p->tss.ss = ss & 0xffff;
	p->tss.ds = ds & 0xffff;
	p->tss.fs = fs & 0xffff;
	p->tss.gs = gs & 0xffff;
	p->tss.ldt = _LDT(nr);
	p->tss.trace_bitmap = 0x80000000;
	/* step 4: save the math unit state if the parent used it last */
	if (last_task_used_math == current)
		__asm__("clts ; fnsave %0"::"m" (p->tss.i387));
	/* step 5: set up the segments and copy the page tables; roll back on failure */
	if (copy_mem(nr,p)) {
		task[nr] = NULL;
		free_page((long) p);
		return -EAGAIN;
	}
	/* step 6: the child shares the parent's open files and inodes */
	for (i=0; i<NR_OPEN;i++)
		if (f=p->filp[i])
			f->f_count++;
	if (current->pwd)
		current->pwd->i_count++;
	if (current->root)
		current->root->i_count++;
	if (current->executable)
		current->executable->i_count++;
	/* step 7: install the child's TSS and LDT descriptors in the GDT */
	set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
	p->state = TASK_RUNNING;	/* do this last, just in case */
	return last_pid;
}

参数nr是find_empty_process找到的空闲task[]槽位下标（由_sys_fork压栈传入），后面都是各寄存器的值。
因为要设置task_struct,然后里面又有tss，tss里面会有各种寄存器，所以我们需要这个参数。

struct task_struct *p;
p = (struct task_struct *) get_free_page();

1、首先分配一个一个进程指针，然后申请了一个页。
2、创建一个子进程的task_struct结构体
3、将当前的子进程放入到整体的进程的列表里面
4、然后大范围的设置创建的task_struct结构体
状态先设置成不可被中断的状态
第一次的时间片是优先级，因为第一次counter是0嘛
5、如果当前进程使用了协处理器，那就设置当前创建进程的协处理器
6、copy_process调用了copy_mem，申请了内存，进行老进程向新进程的代码段、数据段（LDT）的拷贝
7、如果父进程打开了某个某些文件，那么子进程也同样打开这些文件，所以将文件的打开计数加1.
8、设置进程两个段，结合拷贝过来堆栈，然后组成进程。
9、状态改过来，设置成可运行。
10、返回进程号。

3、进程调度

进程调度函数
void schedule进程调度函数。
里面还有函数 switch_to函数进程切换函数

进程状态：
运行状态（就绪状态）：可以被调度运行，进程的切换只会选择处于这个状态的进程
可中断睡眠状态：可以被信号中断使其变成running
不可中断睡眠状态：只能被wakeup所唤醒变为running （sleep_on进入的就是这种）
暂停状态：收到SIGSTOP、SIGTSTP、SIGTTIN等信号
僵死状态：进程已经停止运行，但是父进程还没有将它清空。

通过信号给进程发消息。

sched.c
辅助函数
show_task 打印p->pid state打印栈堆空闲大小
math_state_restore 协处理器协处理器也是一种处理器，所以也会有tss这种。

重点来了 schedule

/*
 *  'schedule()' is the scheduler function. This is GOOD CODE! There
 * probably won't be any reason to change this, as it should work well
 * in all circumstances (ie gives IO-bound processes good response etc).
 * The one thing you might take a look at is the signal-handler code here.
 *
 *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 * information in task[0] is never used.
 */
void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	/* walks from LAST_TASK down to, but not including, FIRST_TASK */
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			/* alarm time has passed: raise SIGALRM and disarm the alarm */
			if ((*p)->alarm && (*p)->alarm < jiffies) {
					(*p)->signal |= (1<<(SIGALRM-1));
					(*p)->alarm = 0;
				}
			/*
			 * A pending signal that is not both blockable and
			 * blocked makes an interruptible sleeper runnable.
			 */
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		/* pick the runnable task with the largest counter; slot 0 is never examined */
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		/*
		 * c > 0: a task with time left was found.
		 * c == -1: nothing is runnable, next stays 0.
		 * c == 0: every runnable task has used up its slice.
		 */
		if (c) break;
		/* refill the counters of all tasks, runnable or not, and search again */
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);
}

进程的调度也是一种系统调用。

创建了结构体的指针

1、
alarm，就是闹钟，就在这里通过时间滴答来响应：闹钟到期就给进程置上SIGALRM信号

for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
	if (*p) {
		if ((*p)->alarm && (*p)->alarm < jiffies) {
				(*p)->signal |= (1<<(SIGALRM-1));
				(*p)->alarm = 0;
			}
		if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&  //去除信号不为空，并且去除不能引发进程就绪状态的阻塞信号。
		(*p)->state==TASK_INTERRUPTIBLE)
		(*p)->state=TASK_RUNNING;
}

task链表，每一项都是task，最大64个。
如果该进程处于可中断睡眠状态，并且收到了没有被屏蔽的信号，就把它置为就绪状态（TASK_RUNNING）。

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		if (c) break;
		//时间片的重分配。
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}

首先会跳过空的任务槽。
下面这个就是对就绪进程counter的一个比较，谁大就先调度谁。就是一个简单的查询。
如果所有就绪进程的时间片都是0，也就是时间片都用完了，就进行时间片的重分配。

最后是一个switch_to。
switch_to(next)。

然后我们去研究这个switch_to函数是干啥的。
就是单纯的进程切换

/*
 * switch_to(n) - switch execution to task[n].
 *
 * Does nothing when task[n] already is `current`. Otherwise _TSS(n) is
 * stored in __tmp.b, task[n] is exchanged into `current`, and a far
 * jump is made through __tmp.
 * NOTE(review): presumably the far jump to a TSS selector triggers a
 * hardware task switch, and execution continues after the ljmp when
 * this task is switched back in - confirm.
 * The instructions after the ljmp compare %ecx (the previous `current`
 * after the xchgl) with last_task_used_math and execute clts on a match.
 */
#define switch_to(n) {\
struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t" \
	"je 1f\n\t" \
	"movw %%dx,%1\n\t" \
	"xchgl %%ecx,_current\n\t" \
	"ljmp %0\n\t" \
	"cmpl %%ecx,_last_task_used_math\n\t" \
	"jne 1f\n\t" \
	"clts\n" \
	"1:" \
	::"m" (*&__tmp.a),"m" (*&__tmp.b), \
	"d" (_TSS(n)),"c" ((long) task[n])); \
}

直接就是一段汇编宏定义。就是效率高。
1、首先比较要切换的进程是不是当前进程
2、把指针赋值给当前任务的指针 current。
3、进行进程的上下文切换
什么叫做上下文程序运行时 CPU的特殊寄存器，通用寄存器 tss等信息 + 当前堆栈信息。

在这里插入图片描述
再顺路看看sleepon
当一个进程要去访问某个资源的时候，碰巧该资源被占用，那么就会调用sleep_on函数，把进程休眠。

/*
 * sleep_on - put the current task to sleep on a wait pointer.
 * @p: address of the wait pointer; nothing happens when it is NULL.
 *
 * The current task replaces the previous waiter in *p, goes into the
 * uninterruptible state and gives up the CPU. Once it runs again it
 * sets the state of the waiter it displaced to 0, so waiters are
 * chained through their own stack frames. Task 0 must never sleep.
 */
void sleep_on(struct task_struct **p)
{
	struct task_struct *prev_waiter;

	if (!p)
		return;
	if (&(init_task.task) == current)
		panic("task[0] trying to sleep");

	/* remember who was waiting before us, then take the head position */
	prev_waiter = *p;
	*p = current;
	current->state = TASK_UNINTERRUPTIBLE;
	schedule();

	/* running again: pass the wake-up on to the displaced waiter */
	if (!prev_waiter)
		return;
	prev_waiter->state = 0;
}

如果当前进程是0号，那就调用panic报错。
如果不是呢，就把当前进程变成不可中断，然后调用进程调度函数。
通过进程调度函数，就实现了p为头，tmp为下一个指针的一个指针链表来管理等待任务。
就是通过那个tmp p current来实现的。
用wakeup唤醒。

4、进程的退出

linux命名规则
syscall_
do_
一般都是会被调用的函数。

exit是销毁函数，当一个进程调用销毁函数的时候
exit是一个系统调用，调用的是do_exit函数。
这个系统调用会干啥。
1、首先是会释放进程的代码段和数据段占用的内存
2、关闭进程打开的所有文件，对当前的目录和i节点进行同步（文件操作）
3、如果当前要销毁的进程有子进程，那么就让第1号进程作为新的父进程（init进程）
4、如果当前进程是一个会话头进程，则会终止会话中的所有进程。
5、改变当前进程的运行状态，变成TASK_ZOMBIE僵死状态。并且向其父进程发送SIGCHLD信号。

1、把进程从task中移除是由其父进程干的，父进程在运行子进程的时候一般都会调用wait、waitpid函数（父进程等待某个子进程终止的）
当父进程收到SIGCHLD信号时，父进程会回收僵死状态的子进程。
2、首先父进程会把子进程的时间累加到自己的运行时间当中。
3、把对应的子进程的进程描述结构体进行释放，置空任务数组中的对应槽位

在这里插入图片描述

内核销毁代码
exit.c

/*
 *  linux/kernel/exit.c
 *
 *  (C) 1991  Linus Torvalds
 */

#include <errno.h>
#include <signal.h>
#include <sys/wait.h>

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/tty.h>
#include <asm/segment.h>

int sys_pause(void);
int sys_close(int fd);

/*
 * release - remove a task from task[] and free its page.
 * @p: task to release; NULL is ignored.
 *
 * Slots 1..NR_TASKS-1 are searched. The matching slot is cleared, the
 * page holding the task structure is freed and schedule() is called.
 * Panics when p is not found in the table.
 */
void release(struct task_struct * p)
{
	int slot = 1;

	if (!p)
		return;
	while (slot < NR_TASKS && task[slot] != p)
		slot++;
	if (slot < NR_TASKS) {
		task[slot] = NULL;
		free_page((long)p);
		schedule();
		return;
	}
	panic("trying to release non-existent task");
}

/*
 * send_sig - mark a signal as pending for a task.
 * @sig:  signal number, 1..32.
 * @p:    target task.
 * @priv: non-zero skips the permission check.
 *
 * Without priv the sender needs the same euid as the target or must
 * pass suser().
 *
 * Returns 0 on success, -EINVAL for a NULL task or a signal number
 * out of range, -EPERM when the sender is not allowed.
 */
static inline int send_sig(long sig,struct task_struct * p,int priv)
{
	int allowed;

	if (!p || sig<1 || sig>32)
		return -EINVAL;
	allowed = priv || (current->euid==p->euid) || suser();
	if (!allowed)
		return -EPERM;
	p->signal |= (1<<(sig-1));
	return 0;
}

/*
 * kill_session - raise SIGHUP for every task in the current session.
 *
 * Scans task[] from the last slot down to, but not including,
 * FIRST_TASK and sets the SIGHUP bit of each task whose session
 * equals that of the current task.
 */
static void kill_session(void)
{
	struct task_struct **slot;

	for (slot = NR_TASKS + task - 1; slot > &FIRST_TASK; slot--) {
		if (!*slot)
			continue;
		if ((*slot)->session != current->session)
			continue;
		(*slot)->signal |= 1<<(SIGHUP-1);
	}
}

/*
 * XXX need to check permissions needed to send signals to process
 * groups, etc. etc.  kill() permissions semantics are tricky!
 */
/*
 * sys_kill - send a signal to a task or to a group of tasks.
 * @pid: selects the targets:
 *         pid > 0   the task whose pid equals pid
 *         pid == 0  tasks whose pgrp equals the caller's pid (sent
 *                   with priv set, so no permission check)
 *         pid == -1 every task except FIRST_TASK
 *         pid < -1  tasks whose pgrp equals -pid
 * @sig: signal number, checked by send_sig().
 *
 * Every branch is braced: without braces the final `else` attached
 * itself to the inner `if` of the pid == -1 loop, so pid < -1 was
 * never handled. Empty task slots are skipped in the pid == -1 case
 * as well, instead of being reported as -EINVAL by send_sig().
 *
 * NOTE(review): the pid == 0 case compares pgrp with current->pid, not
 * current->pgrp; kept as found - confirm the intended semantics.
 *
 * Returns 0 if nothing failed, otherwise the error of the last failing
 * send_sig() call.
 */
int sys_kill(int pid,int sig)
{
	struct task_struct **p = NR_TASKS + task;
	int err, retval = 0;

	if (!pid) {
		while (--p > &FIRST_TASK) {
			if (*p && (*p)->pgrp == current->pid) {
				if ((err = send_sig(sig,*p,1)))
					retval = err;
			}
		}
	} else if (pid>0) {
		while (--p > &FIRST_TASK) {
			if (*p && (*p)->pid == pid) {
				if ((err = send_sig(sig,*p,0)))
					retval = err;
			}
		}
	} else if (pid == -1) {
		while (--p > &FIRST_TASK) {
			if (!*p)
				continue;	/* empty slot, nothing to signal */
			if ((err = send_sig(sig,*p,0)))
				retval = err;
		}
	} else {
		while (--p > &FIRST_TASK) {
			if (*p && (*p)->pgrp == -pid) {
				if ((err = send_sig(sig,*p,0)))
					retval = err;
			}
		}
	}
	return retval;
}

/*
 * tell_father - notify the parent that the current task has exited.
 * @pid: pid of the parent; 0 means there is none to look for.
 *
 * Sets the SIGCHLD bit of the first task in task[] whose pid matches.
 * If no such task exists, a message is printed and the current task
 * releases itself.
 */
static void tell_father(int pid)
{
	int i;

	if (pid) {
		for (i=0;i<NR_TASKS;i++) {
			if (task[i] && task[i]->pid == pid) {
				task[i]->signal |= (1<<(SIGCHLD-1));
				return;
			}
		}
	}
/* if we don't find any fathers, we just release ourselves */
/* This is not really OK. Must change it to make father 1 */
	printk("BAD BAD - no father found\n\r");
	release(current);
}

/*
 * do_exit - terminate the current task.
 * @code: value stored in the task's exit_code.
 *
 * Frees the task's page tables, hands its children over to pid 1,
 * closes its files, drops its inode references, turns it into a
 * zombie, notifies the parent and calls schedule(). The task[] slot
 * itself is not freed here.
 */
int do_exit(long code)
{
	int i;

	/* free the page tables of the code and the data segment */
	free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));
	free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
	/* children get pid 1 as their new parent */
	for (i=0 ; i<NR_TASKS ; i++)
		if (task[i] && task[i]->father == current->pid) {
			task[i]->father = 1;
			/* a child that is already a zombie: tell task[1] about it */
			if (task[i]->state == TASK_ZOMBIE)
				/* assumption task[1] is always init */
				(void) send_sig(SIGCHLD, task[1], 1);
		}
	/* close every open file */
	for (i=0 ; i<NR_OPEN ; i++)
		if (current->filp[i])
			sys_close(i);
	/* drop the inode references of cwd, root and the executable */
	iput(current->pwd);
	current->pwd=NULL;
	iput(current->root);
	current->root=NULL;
	iput(current->executable);
	current->executable=NULL;
	/* a leader that has a tty clears that tty's process group */
	if (current->leader && current->tty >= 0)
		tty_table[current->tty].pgrp = 0;
	if (last_task_used_math == current)
		last_task_used_math = NULL;
	/* a leader sends SIGHUP to its whole session */
	if (current->leader)
		kill_session();
	current->state = TASK_ZOMBIE;
	current->exit_code = code;
	tell_father(current->father);
	schedule();
	return (-1);	/* just to suppress warnings */
}

/*
 * sys_exit - exit system call.
 * @error_code: exit status; only the low 8 bits are used, placed in
 *              bits 8..15 of the code passed to do_exit().
 */
int sys_exit(int error_code)
{
	long status = (error_code & 0xff) << 8;

	return do_exit(status);
}

/*
 * sys_waitpid - wait for a child to stop or exit.
 * @pid:       > 0: that child only; 0: children in the caller's process
 *             group; -1: any child; < -1: children whose pgrp is -pid.
 * @stat_addr: where the status word is stored.
 * @options:   WUNTRACED also reports stopped children; WNOHANG returns
 *             0 instead of sleeping.
 *
 * Returns the pid of the reported child, 0 (WNOHANG, nothing to
 * report), -EINTR when woken by a signal other than SIGCHLD, or
 * -ECHILD when no child matches.
 */
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
	int flag, code;
	struct task_struct ** p;

	verify_area(stat_addr,4);
repeat:
	flag=0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
		/* skip empty slots, ourselves and tasks that are not our children */
		if (!*p || *p == current)
			continue;
		if ((*p)->father != current->pid)
			continue;
		/* apply the pid selection */
		if (pid>0) {
			if ((*p)->pid != pid)
				continue;
		} else if (!pid) {
			if ((*p)->pgrp != current->pgrp)
				continue;
		} else if (pid != -1) {
			if ((*p)->pgrp != -pid)
				continue;
		}
		switch ((*p)->state) {
			case TASK_STOPPED:
				/* stopped children are reported only on request */
				if (!(options & WUNTRACED))
					continue;
				put_fs_long(0x7f,stat_addr);
				return (*p)->pid;
			case TASK_ZOMBIE:
				/* add the child's times to ours, then free its slot */
				current->cutime += (*p)->utime;
				current->cstime += (*p)->stime;
				flag = (*p)->pid;
				code = (*p)->exit_code;
				release(*p);
				put_fs_long(code,stat_addr);
				return flag;
			default:
				/* a matching child exists but has nothing to report yet */
				flag=1;
				continue;
		}
	}
	if (flag) {
		if (options & WNOHANG)
			return 0;
		/* sleep until a signal arrives */
		current->state=TASK_INTERRUPTIBLE;
		schedule();
		/* clear SIGCHLD; rescan if nothing else is pending, otherwise report -EINTR */
		if (!(current->signal &= ~(1<<(SIGCHLD-1))))
			goto repeat;
		else
			return -EINTR;
	}
	return -ECHILD;
}

先来看一下这个release

void release(struct task_struct * p)

完成了清空了任务描述表中的对应进程表项，释放对应的内存页（代码段、数据段、）

5、进程的通信

send_sig

static inline int send_sig(long sig,struct task_struct * p,int priv)

就是发送信号用的。
给指定的p进程发送对应的sig信号

static void kill_session(void)

我们常见的 Linux session 一般是指 shell session。Shell session 是终端中当前的状态，在终端中只能有一个 session。当我们打开一个新的终端时，总会创建一个新的 shell session。
就进程间的关系来说，session 由一个或多个进程组组成。一般情况下，来自单个登录的所有进程都属于同一个 session

会话session的一个概念
在这里插入图片描述
更多的在这里

从任务数组最后一个开始扫描，并没有包括0号进程。
终止会话，向其发送SIGHUP

int sys_kill(int pid,int sig)

向对应的进程号或者进程组号发送任何信号。
pid （pid>0 给进程号为pid的进程发送sig
pid = 0，给进程组号等于当前进程pid的进程组发送sig
pid = -1，给所有进程发送sig
pid < -1，给进程组号为-pid的进程组发送信号）

static void tell_father(int pid)

子进程向父进程发送信号，并且释放子进程。
我们说进程没得时候，就是进入僵死状态时候会通知父进程，就是通过这个函数干的。

int do_exit(long code)

做的就是上面的6条
1、首先是会释放进程的代码段和数据段占用的内存

free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));
free_page_tables(get_base(current->ldt[2]),get_limit(0x17));

2、关闭进程打开的所有文件，对当前的目录和i节点进行同步（文件操作）

for (i=0 ; i<NR_OPEN ; i++)
		if (current->filp[i])
			sys_close(i);
	iput(current->pwd);
	current->pwd=NULL;
	iput(current->root);
	current->root=NULL;
	iput(current->executable);
	current->executable=NULL;

3、如果当前要销毁的进程有子进程，那么就让第1号进程作为新的父进程（init进程）

for (i=0 ; i<NR_TASKS ; i++)
		if (task[i] && task[i]->father == current->pid) {
			task[i]->father = 1;
			if (task[i]->state == TASK_ZOMBIE)
				/* assumption task[1] is always init */
				(void) send_sig(SIGCHLD, task[1], 1);
		}
	if (current->leader && current->tty >= 0)
		tty_table[current->tty].pgrp = 0;     //tty 控制台   关闭控制台
	if (last_task_used_math == current)
		last_task_used_math = NULL;     //关闭协处理器

4、如果当前进程是一个会话头进程，则会终止会话中的所有进程。

if (current->leader)
		kill_session();

5、改变当前进程的运行状态，变成TASK_ZOMBIE僵死状态。并且向其父进程发送SIGCHLD信号。

current->state = TASK_ZOMBIE;
	current->exit_code = code;
	tell_father(current->father);

6、重新调度进程

	schedule();

注意到有个waitpid函数，我们之前说这个函数会在父进程等待子进程中用到

int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)

这个函数会对上上面说过的

1、把进程从task中移除是由其父进程干的，父进程在运行子进程的时候一般都会调用wait、
waitpid函数（父进程等待某个子进程终止的）。当父进程收到SIGCHLD信号时，父进程会回收僵死状态的子进程。
2、首先父进程会把子进程的时间累加到自己的运行时间当中。 3、把对应的子进程的进程描述结构体进行释放，置空任务数组中的对应槽位

这三条

/*
 * sys_waitpid - let a parent collect a stopped or exited child.
 * @pid:       selects the children: > 0 one pid, 0 the caller's
 *             process group, -1 any child, < -1 process group -pid.
 * @stat_addr: receives the status word.
 * @options:   WUNTRACED and WNOHANG are honoured.
 *
 * Returns the child's pid, 0 with WNOHANG when nothing is ready,
 * -EINTR when interrupted by another signal, -ECHILD without a
 * matching child.
 */
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
	int flag, code;
	struct task_struct ** p;

	verify_area(stat_addr,4);
repeat:
	flag=0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
		/* only our own children are of interest */
		if (!*p || *p == current)
			continue;
		if ((*p)->father != current->pid)
			continue;
		if (pid>0) {
			if ((*p)->pid != pid)
				continue;
		} else if (!pid) {
			if ((*p)->pgrp != current->pgrp)
				continue;
		} else if (pid != -1) {
			if ((*p)->pgrp != -pid)
				continue;
		}
		switch ((*p)->state) {
			case TASK_STOPPED:
				if (!(options & WUNTRACED))
					continue;
				/* status 0x7f is stored for a stopped child */
				put_fs_long(0x7f,stat_addr);
				return (*p)->pid;
			case TASK_ZOMBIE:
				/* step 2 of the text: add the child's run times to the parent */
				current->cutime += (*p)->utime;
				current->cstime += (*p)->stime;
				flag = (*p)->pid;
				code = (*p)->exit_code;
				/* step 3 of the text: free the task structure and its slot */
				release(*p);
				put_fs_long(code,stat_addr);
				return flag;
			default:
				/* child still alive: remember that waiting makes sense */
				flag=1;
				continue;
		}
	}
	if (flag) {
		if (options & WNOHANG)
			return 0;
		current->state=TASK_INTERRUPTIBLE;
		schedule();
		/* woken up: SIGCHLD alone means look again, anything else is -EINTR */
		if (!(current->signal &= ~(1<<(SIGCHLD-1))))
			goto repeat;
		else
			return -EINTR;
	}
	return -ECHILD;
}