Before getting into the generic block layer's I/O scheduling algorithms, let's first review how a page reaches the generic block layer, taking ext4 as an example. In ext4's writeback path, the writeback thread first maps the page to buffer heads; submit_bh then wraps each buffer head in a bio and calls submit_bio, which is the entry point into the generic block layer. submit_bio (through generic_make_request) invokes the queue's make_request_fn callback to submit the bio; for an ordinary single-queue block device make_request_fn points to blk_queue_bio, and it is inside blk_queue_bio that I/O scheduling really begins.
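To make that hand-off concrete, here is a minimal sketch of what submit_bh does, assuming a roughly 3.x/4.x-era kernel; it is a simplified illustration, not the verbatim kernel code, and the exact field names and the submit_bio signature differ between versions:
/*
 * Simplified sketch of submit_bh(): wrap one buffer_head in a
 * single-segment bio and hand it to the generic block layer.
 */
static int submit_bh_sketch(int rw, struct buffer_head *bh)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	/* the buffer head covers b_size bytes starting at block b_blocknr */
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

	bio->bi_private = bh;	/* the completion callback finds the bh here */

	submit_bio(rw, bio);	/* enter the generic block layer */
	return 0;
}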
Back to the main topic: the deadline algorithm. Deadline was designed to cure the starvation that purely sector-sorted (elevator-style) dispatch can cause. The core idea is to give every request a deadline and to service a request as soon as possible once its deadline expires. In linux/block/deadline-iosched.c the kernel defines the core data structure, deadline_data:
struct deadline_data {
/*
* run time data
*/
/*
* requests (deadline_rq s) are present on both sort_list and fifo_list
*/
struct rb_root sort_list[2];
struct list_head fifo_list[2];
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2];
unsigned int batching; /* number of sequential requests made */
unsigned int starved; /* times reads have starved writes */
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2];
int fifo_batch;
int writes_starved;
int front_merges;
};
Let me go through these fields one by one. The deadline algorithm maintains two kinds of queues, and each kind is split into a read queue and a write queue. The first kind is sort_list, which orders requests by starting sector and is organized as a red-black tree (rb_root is the root of that tree); the other is fifo_list, a plain first-in-first-out queue, naturally ordered by arrival time and organized as a circular doubly linked list (list_head). next_rq holds the next read/write request to dispatch in sort order. The remaining fields are best explained through two mechanisms:
1. Batched dispatch
Once the read/write direction has been chosen, deadline dispatches requests in batches. The fifo_batch field sets the maximum batch size, 16 by default, and batching counts how many requests of the current batch have already been dispatched. While batching is below fifo_batch, requests keep being dispatched back to back; once it reaches or exceeds 16 (batching starts from 0, so the values 0-15 stay within a batch), the next round of batched dispatch is started. See the excerpt after this list.
2. Write starvation threshold
Reads and writes are not treated equally: the kernel strongly favors reads. The fifo_expire array stores the per-direction deadlines, 500 ms for reads and 5 s for writes. Moreover, even an expired write is not necessarily serviced immediately; writes are only dispatched once read batches have pushed them aside more times than the write starvation threshold allows. The reasoning is that reads are synchronous and block the issuing process, so they must be handled right away, while writes are asynchronous, the process can go off and do other work, so their latency matters less. In deadline_data, starved counts how many times in a row read batches have been preferred while writes were waiting, and writes_starved is the starvation threshold, 2 by default. Finally, the front_merges field controls whether front-merge checking is performed. See the excerpt after this list.
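Both mechanisms boil down to two checks excerpted from deadline_dispatch_requests (listed in full later):
	/* 1. batched dispatch: keep going in the current direction while the
	 * batch is not yet full (dd->batching runs from 0 to fifo_batch - 1
	 * within one batch) */
	if (rq && dd->batching < dd->fifo_batch)
		goto dispatch_request;

	/* 2. write starvation threshold: reads are preferred, but each time a
	 * read batch is chosen while writes are waiting, dd->starved is
	 * incremented; once it passes writes_starved (default 2), the writes
	 * finally get their turn */
	if (writes && (dd->starved++ >= dd->writes_starved))
		goto dispatch_writes;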
deadline_data is initialized as follows; the first four lines are the default constants defined at the top of deadline-iosched.c, and the assignments below them are taken from the scheduler's init path:
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
dd->sort_list[READ] = RB_ROOT; //root of the red-black tree, initialized empty: #define RB_ROOT (struct rb_root) { NULL, }
dd->sort_list[WRITE] = RB_ROOT;
dd->fifo_expire[READ] = read_expire;
dd->fifo_expire[WRITE] = write_expire;
dd->writes_starved = writes_starved;
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
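For context, these assignments live in the scheduler's init hook. The sketch below shows roughly where they sit in deadline_init_queue; the allocation and error handling are simplified and may differ slightly from your kernel version:
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct deadline_data *dd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	/* one deadline_data per request queue, hung off the elevator */
	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
	if (!dd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = dd;

	/* ... the INIT_LIST_HEAD / RB_ROOT / fifo_expire assignments above ... */

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);
	return 0;
}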
Next, let's look at the relevant functions in detail.
The first step is turning a bio into a request. How is a request built? If the bio can be merged into some existing request, there is no need to allocate a new one.
So the kernel calls elv_merge, which (after trying the generic back-merge paths) invokes e->type->ops.elevator_merge_fn; in the deadline scheduler this hook is deadline_merge:
static int
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
struct deadline_data *dd = q->elevator->elevator_data;
struct request *__rq;
int ret;
/*
* check for front merge
*/
if (dd->front_merges) { //front merging is enabled
sector_t sector = bio_end_sector(bio); //the sector just past the end of the bio
//look for a request whose starting sector equals sector, i.e. the bio is contiguous right in front of it
__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
if (__rq) {
BUG_ON(sector != blk_rq_pos(__rq));
//the merge is allowed: report a front merge (the actual merging is done by the caller)
if (elv_rq_merge_ok(__rq, bio)) {
ret = ELEVATOR_FRONT_MERGE;
goto out;
}
}
}
return ELEVATOR_NO_MERGE;
out:
*req = __rq;
return ret;
}
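elv_rb_find is the generic elevator helper used above; it walks the red-black tree looking for a request whose starting sector matches exactly. Roughly, from block/elevator.c (the exact code may differ slightly between kernel versions):
struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;	/* exact match: the bio can be front-merged here */
	}

	return NULL;
}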
After a bio has been merged into a request, that enlarged request may in turn become mergeable with an adjacent request. When two requests are merged this way, deadline_merged_requests is called to do the cleanup:
static void
deadline_merged_requests(struct request_queue *q, struct request *req,
struct request *next)
{
/*
* if next expires before rq, assign its expire time to rq
* and move into next position (next will be deleted) in fifo
*/
if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
if (time_before(next->fifo_time, req->fifo_time)) { //next's deadline comes before req's
list_move(&req->queuelist, &next->queuelist); //queuelist links the request into deadline's fifo_list: move req into next's slot (next is about to be removed)
req->fifo_time = next->fifo_time; //next was due to be serviced first, so the merged request inherits the earlier deadline
}
}
/*
* kill knowledge of next, this one is a goner
*/
deadline_remove_request(q, next); //remove next from both fifo_list and sort_list (see the helper sketched after this function)
}
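deadline_remove_request is not shown in the original listing; it simply undoes what deadline_add_request does, taking the request off both queues. Paraphrased from the same file (details vary slightly by kernel version):
static inline struct request *
deadline_latter_request(struct request *rq)
{
	struct rb_node *node = rb_next(&rq->rb_node);

	if (node)
		return rb_entry(node, struct request, rb_node);

	return NULL;
}

static void deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
	const int data_dir = rq_data_dir(rq);

	/* if this request was cached as the next one to dispatch, advance */
	if (dd->next_rq[data_dir] == rq)
		dd->next_rq[data_dir] = deadline_latter_request(rq);

	elv_rb_del(deadline_rb_root(dd, rq), rq);
}

/*
 * remove rq from rbtree and fifo.
 */
static void deadline_remove_request(struct request_queue *q, struct request *rq)
{
	struct deadline_data *dd = q->elevator->elevator_data;

	rq_fifo_clear(rq);		/* drop rq from fifo_list */
	deadline_del_rq_rb(dd, rq);	/* drop rq from sort_list */
}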
One more piece of housekeeping is needed for front merges: when a bio is merged in front of a request, the request's starting sector changes, so deadline_merged_request (note: the singular variant) removes it from sort_list and re-inserts it at its new position:
static void deadline_merged_request(struct request_queue *q,
struct request *req, int type)
{
struct deadline_data *dd = q->elevator->elevator_data;
/*
* if the merge was a front merge, we need to reposition request
*/
if (type == ELEVATOR_FRONT_MERGE) {
elv_rb_del(deadline_rb_root(dd, req), req);
deadline_add_rq_rb(dd, req);
}
}
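elv_rb_del, used here via deadline_rb_root, is another small generic helper; roughly, from block/elevator.c:
void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);	/* unlink from the tree */
	RB_CLEAR_NODE(&rq->rb_node);	/* mark the node as no longer on a tree */
}
The request is then re-added with its new starting sector by deadline_add_rq_rb, shown a bit further down.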
At this point merging is finally done; the next step is inserting the request. blk_queue_bio calls __elv_add_request, and __elv_add_request in turn calls q->elevator->type->ops.elevator_add_req_fn, which the deadline scheduler defines as deadline_add_request:
/*
* add rq to rbtree and fifo
*/
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq); // direction of the request: READ or WRITE
deadline_add_rq_rb(dd, rq); //insert into sort_list ("rb" stands for red-black tree)
/*
* set expire time and add to fifo list
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; //deadline = now (the global jiffies tick counter) + the per-direction expire time
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); //append rq to the tail of fifo_list
}
Now let's look at deadline_add_rq_rb:
static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
return &dd->sort_list[rq_data_dir(rq)];
}
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
//root points to the rb_root of the sort_list for this request's direction
struct rb_root *root = deadline_rb_root(dd, rq);
//elv_rb_add links rq (via its rb_node field) into the tree, keyed by the request's starting sector
elv_rb_add(root, rq);
}
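For completeness, elv_rb_add is the generic insertion helper; roughly, from block/elevator.c (exact form may differ slightly by kernel version):
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		/* walk down the tree, ordering by starting sector */
		if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
		else
			p = &(*p)->rb_left;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}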
Finally, dispatching requests:
static int deadline_dispatch_requests(struct request_queue *q, int force)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
struct request *rq;
int data_dir;
/*
* batches are currently reads XOR writes
*/
if (dd->next_rq[WRITE])
rq = dd->next_rq[WRITE];
else
rq = dd->next_rq[READ];
//if there is a cached next request and the current batch is not yet full, dispatch it straight away
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request are still entitled to batch */
goto dispatch_request;
/*
* at this point we are not running a batch. select the appropriate
* data direction (read / write)
*/
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
//writes are waiting and reads have pushed them aside writes_starved times in a row: go service the writes
if (writes && (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
//otherwise keep serving reads
data_dir = READ;
goto dispatch_find_request;
}
/*
* there are either no reads or writes have been starved
*/
if (writes) {
dispatch_writes:
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
dd->starved = 0;
data_dir = WRITE;
goto dispatch_find_request;
}
return 0;
dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
//if the oldest request in this direction has expired, or there is no cached next request in sort order, restart from the head of fifo_list (taking it from fifo_list is fine because the same request also lives in sort_list via its rb_node, so the batch can continue in sector order from it); execution then falls through to dispatch_request below
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time. */
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
} else { // no expired request and the last dispatch was in the same direction: continue in sector order
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
rq = dd->next_rq[data_dir]; //continue the current walk: take the cached next request in sector order
}
dd->batching = 0; //a new batch starts with this request
dispatch_request:
/*
* rq is the selected appropriate request.
*/
dd->batching++;
deadline_move_request(dd, rq); //remove rq from fifo_list and sort_list and put it on the device's dispatch queue; at this point it leaves the I/O scheduler (see the helpers after this function)
return 1;
}
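Two helpers used above are not listed in the original excerpt: deadline_check_fifo just looks at the oldest request on the FIFO and tests its deadline against jiffies, and deadline_move_request caches the next request in sort order (via deadline_latter_request, sketched earlier), then moves the chosen request onto the device's dispatch queue. Roughly, paraphrased from the same file:
/*
 * returns 1 if the oldest request in the given direction has exceeded
 * its deadline, 0 otherwise
 */
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);

	if (time_after_eq(jiffies, rq->fifo_time))
		return 1;

	return 0;
}

static void
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
	struct request_queue *q = rq->q;

	deadline_remove_request(q, rq);	/* drop from fifo_list and sort_list */
	elv_dispatch_add_tail(q, rq);	/* hand over to the dispatch queue */
}

static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
	const int data_dir = rq_data_dir(rq);

	/* remember the next request in sector order for the current batch */
	dd->next_rq[READ] = NULL;
	dd->next_rq[WRITE] = NULL;
	dd->next_rq[data_dir] = deadline_latter_request(rq);

	deadline_move_to_dispatch(dd, rq);
}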