5.4.1 Qemu block driver
(1) Block driver registration
bdrv_register is the function used to register a block driver; raw-posix.c implements the raw block driver.
block_init(bdrv_file_init); block_init is the generic registration entry point.
bdrv_file_init ==> bdrv_register(&bdrv_file);
void bdrv_register(BlockDriver *bdrv)
{
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        if (!bdrv->bdrv_aio_readv) {
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }
bdrv_co_readv/bdrv_co_writev are the low-level implementations of read and write.
static BlockDriver bdrv_file = {
    .format_name = "file",
    .protocol_name = "file",
    .instance_size = sizeof(BDRVRawState),
    .bdrv_probe = NULL, /* no probe for protocols */
    .bdrv_file_open = raw_open,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
    .bdrv_create = raw_create,
    .bdrv_co_discard = raw_co_discard,
    .bdrv_co_is_allocated = raw_co_is_allocated,
    .bdrv_aio_readv = raw_aio_readv,
    .bdrv_aio_writev = raw_aio_writev,
    .bdrv_aio_flush = raw_aio_flush,
    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
    .create_options = raw_create_options,
};
bdrv_register then finishes by inserting the driver into the global driver list:
    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
bdrv_file reads and writes the file directly.
Another commonly used block driver is bdrv_qcow2, which implements the qcow2 image format used for kvm virtual machine images.
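To make the registration pattern concrete, here is a minimal standalone sketch of the same idea: default a missing callback, then link the descriptor into a global list. All names (blk_driver, drv_register, ...) are hypothetical; this is not QEMU code.

/* Sketch of the bdrv_register() pattern: fill in missing callbacks with a
 * default, then insert the descriptor at the head of a global list. */
#include <stdio.h>

struct blk_driver {
    const char *format_name;
    int (*read)(const char *fname);   /* optional; defaulted at registration */
    struct blk_driver *next;          /* global list linkage */
};

static struct blk_driver *drv_list;

static int generic_read_em(const char *fname)
{
    printf("emulated read of %s\n", fname);
    return 0;
}

static void drv_register(struct blk_driver *drv)
{
    if (!drv->read) {
        drv->read = generic_read_em;  /* like bdrv_co_readv_em emulation */
    }
    drv->next = drv_list;             /* insert at list head */
    drv_list = drv;
}

static struct blk_driver file_driver = {
    .format_name = "file",
    .read        = NULL,              /* rely on the emulated default */
};

int main(void)
{
    drv_register(&file_driver);
    for (struct blk_driver *d = drv_list; d; d = d->next) {
        printf("registered driver: %s\n", d->format_name);
        d->read(d->format_name);
    }
    return 0;
}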
(2) Block driver matching
Section 5.2 traced main (vl.c) ==> drive_init_func ==> drive_init (blockdev.c); this section analyzes the flow of that function.
a) Obtain the various block device parameters via qemu_opt_get_number.
b) file = qemu_opt_get(opts, "file"); check these parameters for validity and initialize the BlockIOLimit io_limits.
c) If the user specified a format in the options, look up drv from it:
   drv = bdrv_find_whitelisted_format(buf); otherwise drv stays NULL. Here we analyze the NULL case.
d) Finally store these parameters into DriveInfo *dinfo:
   QTAILQ_INSERT_TAIL(&drives, dinfo, next);
   bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
   bdrv_set_io_limits(dinfo->bdrv, &io_limits);
e) Open the drive according to the file parameter:
   ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv);
Inside bdrv_open:
a) If BDRV_O_SNAPSHOT is specified, that case is handled first.
b) If drv was not given, find_image_format is called to determine drv.
find_image_format:
If the device is an SG (SCSI generic) device or has no medium inserted, the raw driver is returned directly:
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }
Otherwise the first bytes of the image file are read into a buffer, and each registered driver's probe function is called:
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
The driver whose probe returns the highest score is selected as the block driver.
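The probing idea itself is easy to reproduce outside QEMU. The following standalone sketch scores a buffer against a qcow2-style magic check plus a raw fallback that returns a low score; the surrounding names are hypothetical, only the magic bytes 'Q' 'F' 'I' 0xfb match the real qcow2 header.

/* Sketch of probe-based format matching: each driver scores the first
 * bytes of the image and the highest score wins. */
#include <stdio.h>
#include <string.h>

struct probe_driver {
    const char *name;
    int (*probe)(const unsigned char *buf, int buf_size);
};

static int qcow2_probe(const unsigned char *buf, int buf_size)
{
    /* qcow2 images start with the magic "QFI\xfb" */
    if (buf_size >= 4 && memcmp(buf, "QFI\xfb", 4) == 0) {
        return 100;
    }
    return 0;
}

static int raw_probe(const unsigned char *buf, int buf_size)
{
    (void)buf;
    (void)buf_size;
    return 1;   /* raw matches anything, but only as a last resort */
}

static const struct probe_driver drivers[] = {
    { "qcow2", qcow2_probe },
    { "raw",   raw_probe   },
};

static const char *find_format(const unsigned char *buf, int buf_size)
{
    const char *best = NULL;
    int score_max = 0;

    for (size_t i = 0; i < sizeof(drivers) / sizeof(drivers[0]); i++) {
        int score = drivers[i].probe(buf, buf_size);
        if (score > score_max) {
            score_max = score;
            best = drivers[i].name;
        }
    }
    return best;
}

int main(void)
{
    unsigned char header[4] = { 'Q', 'F', 'I', 0xfb };
    printf("detected format: %s\n", find_format(header, sizeof(header)));
    return 0;
}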
5.4.2 Block driver read/write
First, recall the IDE DMA read/write path:
ide_dma_cb (hw/ide/core.c) ==>
    s->bus->dma->aiocb = dma_bdrv_read(s->bs, &s->sg, sector_num, ide_dma_cb, s);
dma_bdrv_read ==> return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque,
                                     DMA_DIRECTION_FROM_DEVICE); ==> dma_bdrv_cb
dma_bdrv_cb ==> dbs = qemu_aio_get   // dbs->cb = ide_dma_cb
    dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
                            dbs->iov.size / 512, dma_bdrv_cb, dbs);
io_func = bdrv_aio_readv ==> bdrv_co_aio_rw_vector ==>
    a. acb = qemu_aio_get   // acb->cb = dma_bdrv_cb
    b. co = qemu_coroutine_create(bdrv_co_do_rw);
       qemu_coroutine_enter(co, acb);
A coroutine is a lightweight mechanism for cooperative concurrent execution (in qemu it is implemented with fibers on Windows and with threads on Linux). bdrv_co_do_rw is the function body run inside the coroutine, and qemu_coroutine_enter starts executing bdrv_co_do_rw.
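As a side note, the enter/yield mechanism of a coroutine can also be sketched on Linux with ucontext in a few lines. This is only an illustration of the technique, not QEMU's coroutine code; all names here are hypothetical.

/* Minimal ucontext-based coroutine sketch: "enter" switches into the
 * coroutine, "yield" switches back to the caller. */
#include <stdio.h>
#include <ucontext.h>

static ucontext_t caller_ctx, co_ctx;
static char co_stack[64 * 1024];

static void coroutine_yield(void)
{
    swapcontext(&co_ctx, &caller_ctx);    /* back to the caller */
}

static void coroutine_body(void)
{
    printf("coroutine: started, submitting fake I/O\n");
    coroutine_yield();                    /* wait to be re-entered */
    printf("coroutine: resumed after I/O completion\n");
}

int main(void)
{
    getcontext(&co_ctx);
    co_ctx.uc_stack.ss_sp = co_stack;
    co_ctx.uc_stack.ss_size = sizeof(co_stack);
    co_ctx.uc_link = &caller_ctx;         /* return here when the body ends */
    makecontext(&co_ctx, coroutine_body, 0);

    swapcontext(&caller_ctx, &co_ctx);    /* first enter */
    printf("main: coroutine yielded, pretend the AIO completed\n");
    swapcontext(&caller_ctx, &co_ctx);    /* re-enter, like a completion callback */
    printf("main: coroutine finished\n");
    return 0;
}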
static void coroutine_fn bdrv_co_do_rw(void *opaque)
{
    BlockDriverAIOCBCoroutine *acb = opaque;
    BlockDriverState *bs = acb->common.bs;

    if (!acb->is_write) {
        acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    } else {
        acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
            acb->req.nb_sectors, acb->req.qiov, 0);
    }

    acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);  /* completion bottom half */
    qemu_bh_schedule(acb->bh);
}
bdrv_co_do_readv/bdrv_co_do_writev are the functions that kick off the asynchronous read/write.
The bottom half created by qemu_bh_new(bdrv_co_em_bh, acb) is used to deliver the completion callback:
when acb->bh runs, it executes bdrv_co_em_bh.
bdrv_co_do_readv ==> drv->bdrv_co_readv, which bdrv_register set to
bdrv_co_readv_em ==> bdrv_co_io_em
static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *iov,
                                      bool is_write)
{
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockDriverAIOCB *acb;

    if (is_write) {
        acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
    } else {
        acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
    }
    .......
}
drv->bdrv_aio_readv = raw_aio_readv ==> raw_aio_submit ==> paio_submit, which submits the asynchronous I/O:
BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_paiocb *acb;

    acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
    acb->aio_type = type;
    acb->aio_fildes = fd;
    .......
    qemu_paio_submit(acb);
    return &acb->common;
}
static void qemu_paio_submit(struct qemu_paiocb *aiocb)
{
    aiocb->ret = -EINPROGRESS;
    aiocb->active = 0;
    mutex_lock(&lock);
    if (idle_threads == 0 && cur_threads < max_threads)
        spawn_thread();
    QTAILQ_INSERT_TAIL(&request_list, aiocb, node); /* add aiocb to the AIO request list */
    mutex_unlock(&lock);
    cond_signal(&cond);
}
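qemu_paio_submit is the classic submit-to-worker-pool pattern: enqueue under a mutex, spawn a worker if none is idle, signal a condition variable. A standalone pthread sketch of that pattern follows; all names are hypothetical, this is not QEMU code.

/* Sketch of the submit/worker pattern used by paio: requests are queued
 * under a mutex and a condition variable wakes a worker thread. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct request {
    int id;
    struct request *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static struct request *queue_head;

static void *aio_worker(void *arg)
{
    (void)arg;
    for (;;) {
        pthread_mutex_lock(&lock);
        while (!queue_head) {
            pthread_cond_wait(&cond, &lock);   /* idle until work arrives */
        }
        struct request *req = queue_head;      /* dequeue one request */
        queue_head = req->next;
        pthread_mutex_unlock(&lock);

        printf("worker: handling request %d\n", req->id);
        free(req);
    }
    return NULL;
}

static void submit(int id)
{
    struct request *req = malloc(sizeof(*req));
    req->id = id;
    pthread_mutex_lock(&lock);
    req->next = queue_head;                    /* enqueue */
    queue_head = req;
    pthread_mutex_unlock(&lock);
    pthread_cond_signal(&cond);                /* wake a worker */
}

int main(void)
{
    pthread_t tid;
    pthread_create(&tid, NULL, aio_worker, NULL);
    submit(1);
    submit(2);
    sleep(1);                                  /* let the worker drain the queue */
    return 0;
}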
The whole call path is chained together through callbacks, one registered at each layer shown above. The next section analyzes how the asynchronous I/O is actually executed.
5.4.3 Qemu AIO framework
(1) spawn_thread
spawn_thread from the previous section ==> qemu_bh_schedule(new_thread_bh);
In the AIO framework's initialization function paio_init (posix-aio-compat.c):
    qemu_pipe(fds);
    s->rfd = fds[0];
    s->wfd = fds[1];                /* the AIO manager's fds form a pipe */
    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
    QTAILQ_INIT(&request_list);     /* initialize the AIO request list */
    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
posix_aio_read is the function used to invoke the AIO completion callbacks.
qemu_aio_set_fd_handler sets up an AioHandler *node:
    node->io_read = posix_aio_read;
    node->io_write = NULL;
    node->io_flush = posix_aio_flush;
    node->opaque = s;
and registers this node for the AIO manager.
spawn_thread_bh_fn ==> do_spawn_thread ==> thread_create(&thread_id, &attr, aio_thread, NULL);
So spawn_thread ultimately starts an aio_thread.
(2) aio_thread
aio_thread is the AIO processing loop:
a. Take an aiocb off the request list: QTAILQ_REMOVE(&request_list, aiocb, node);
b. Call handle_aiocb_rw to perform the read/write (see the sketch after this list):
   handle_aiocb_rw ==> handle_aiocb_rw_linear ==>
       pread(aiocb->aio_fildes, buf + offset,
             aiocb->aio_nbytes - offset,
             aiocb->aio_offset + offset);
   aio_fildes is the file descriptor of the image file opened by the block driver; the actual read/write is issued on it.
c. posix_aio_notify_event
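The loop in handle_aiocb_rw_linear exists because pread may return fewer bytes than requested. A standalone sketch of that retry pattern is shown below (read_all is a hypothetical helper name, not a QEMU function):

/* Sketch of the short-read loop behind handle_aiocb_rw_linear: pread may
 * return fewer bytes than asked for, so keep advancing the offset. */
#include <errno.h>
#include <unistd.h>

static ssize_t read_all(int fd, void *buf, size_t nbytes, off_t file_off)
{
    size_t offset = 0;

    while (offset < nbytes) {
        ssize_t len = pread(fd, (char *)buf + offset,
                            nbytes - offset, file_off + offset);
        if (len == 0) {
            break;                 /* end of file */
        }
        if (len < 0) {
            if (errno == EINTR) {
                continue;          /* interrupted, retry */
            }
            return -errno;
        }
        offset += len;
    }
    return (ssize_t)offset;
}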
(3) Bottom-half execution
So how does qemu_bh_schedule work?
void qemu_bh_schedule(QEMUBH *bh)
{
    if (bh->scheduled)
        return;
    bh->scheduled = 1;
    bh->idle = 0;
    qemu_notify_event();    /* call SetEvent(qemu_event_handle) */
}
main_loop (vl.c) ==> main_loop_wait ==>
a) qemu_iohandler_fill fills in the read/write select fd sets from io_handlers.
b) os_host_main_loop_wait calls select.
c) qemu_iohandler_poll: for every fd that select reports as ready, call the corresponding io_read/io_write handler.
d) qemu_bh_poll (async.c) walks the bottom-half list and calls each bh's handler.
This is how spawn_thread_bh_fn gets called.
int qemu_bh_poll(void)
{
    .....
    for (bh = first_bh; bh; bh = next) {
        next = bh->next;
        if (!bh->deleted && bh->scheduled) {
            bh->scheduled = 0;
            if (!bh->idle)
                ret = 1;
            bh->idle = 0;
            bh->cb(bh->opaque);
        }
    }
    .......
}
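A bottom half is therefore just a deferred callback: scheduling marks it pending, and the main loop's poll sweeps the list and runs every pending one. A minimal standalone sketch of that schedule/poll pair follows (hypothetical names, not QEMU code):

/* Sketch of the bottom-half mechanism: qemu_bh_schedule-style marking plus
 * a qemu_bh_poll-style sweep in the main loop. */
#include <stdio.h>

struct bh {
    void (*cb)(void *opaque);
    void *opaque;
    int scheduled;
    struct bh *next;
};

static struct bh *first_bh;

static void bh_schedule(struct bh *bh)
{
    if (bh->scheduled) {
        return;                    /* already pending */
    }
    bh->scheduled = 1;
    /* a real main loop would also be woken up here (cf. qemu_notify_event) */
}

static void bh_poll(void)
{
    for (struct bh *bh = first_bh; bh; bh = bh->next) {
        if (bh->scheduled) {
            bh->scheduled = 0;     /* clear before running the callback */
            bh->cb(bh->opaque);
        }
    }
}

static void hello_cb(void *opaque)
{
    printf("bottom half ran: %s\n", (const char *)opaque);
}

int main(void)
{
    struct bh bh = { hello_cb, "deferred work", 0, NULL };
    first_bh = &bh;
    bh_schedule(&bh);
    bh_poll();                     /* what the main loop does each iteration */
    return 0;
}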
Now let's look at when the AIO fd handler gets called.
aio_thread ==> posix_aio_notify_event ==> write(posix_aio_state->wfd, &byte, sizeof(byte));
So when a read/write completes, the worker thread writes a byte into the pipe; the select in os_host_main_loop_wait then reports posix_aio_state->rfd as readable, and qemu_iohandler_poll calls node->io_read, i.e. posix_aio_read.
posix_aio_read ==>
    *pacb = acb->next;
    /* call the callback */
    acb->common.cb(acb->common.opaque, ret);   /* the acb's completion function is invoked here */
    qemu_aio_release(acb);
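This notification path is the classic self-pipe trick: a worker thread writes one byte into a pipe and the main loop's select wakes up on the read end. A standalone sketch of the trick (hypothetical names, not QEMU code):

/* Sketch of the self-pipe notification used by paio: a worker writes one
 * byte when an I/O completes, select() in the main loop wakes up on the
 * read end and runs the completion handler. */
#include <pthread.h>
#include <stdio.h>
#include <sys/select.h>
#include <unistd.h>

static int notify_fds[2];          /* [0] = read end, [1] = write end */

static void *worker(void *arg)
{
    (void)arg;
    /* ... perform the actual I/O here ... */
    char byte = 0;
    write(notify_fds[1], &byte, 1);     /* posix_aio_notify_event analogue */
    return NULL;
}

int main(void)
{
    pthread_t tid;
    pipe(notify_fds);
    pthread_create(&tid, NULL, worker, NULL);

    fd_set rfds;
    FD_ZERO(&rfds);
    FD_SET(notify_fds[0], &rfds);
    select(notify_fds[0] + 1, &rfds, NULL, NULL, NULL);   /* main loop wait */

    if (FD_ISSET(notify_fds[0], &rfds)) {
        char byte;
        read(notify_fds[0], &byte, 1);  /* drain the pipe */
        printf("main loop: completion callback would run here\n");
    }
    pthread_join(tid, NULL);
    return 0;
}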
For the read/write example above, bdrv_co_io_em_complete gets called; then the bottom half scheduled in bdrv_co_do_rw runs and invokes dma_bdrv_cb, which drives the DMA operation forward until all data has been processed. Its code is as follows:
static void dma_bdrv_cb(void *opaque, int ret)
{
    .....
    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dma_complete(dbs, ret);   /* all data done (or error): do not submit more AIO */
        return;
    }

    while (dbs->sg_cur_index < dbs->sg->nsg) {
        cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
        cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
        mem = dma_memory_map(dbs->sg->dma, cur_addr, &cur_len, dbs->dir);
        if (!mem)
            break;
        qemu_iovec_add(&dbs->iov, mem, cur_len);
        dbs->sg_cur_byte += cur_len;
        if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
            dbs->sg_cur_byte = 0;
            ++dbs->sg_cur_index;
        }
    }
    .....
    /* on the first call, or while data remains, the next AIO is issued via
     * io_func (which goes back through bdrv_co_do_rw) */
    dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
                            dbs->iov.size / 512, dma_bdrv_cb, dbs);
}
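The sg loop above packs guest memory chunks into the QEMUIOVector before reissuing the I/O. The underlying vectored-I/O idea can be shown with a small standalone sketch using struct iovec and preadv on Linux; this is only an illustration (the file path is arbitrary), not QEMU code.

/* Sketch of vectored I/O: several destination buffers are described by an
 * iovec array and filled in a single preadv() call, which is the role the
 * QEMUIOVector plays in dma_bdrv_cb. */
#define _DEFAULT_SOURCE           /* for preadv() with glibc */
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    char part1[16], part2[32];
    struct iovec iov[2] = {
        { .iov_base = part1, .iov_len = sizeof(part1) },
        { .iov_base = part2, .iov_len = sizeof(part2) },
    };

    int fd = open("/etc/hostname", O_RDONLY);   /* any readable file works */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* read up to 48 bytes starting at offset 0, scattered across both buffers */
    ssize_t n = preadv(fd, iov, 2, 0);
    printf("preadv returned %zd bytes\n", n);

    close(fd);
    return 0;
}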