注:本文分析基于linux-4.18.0-193.14.2.el8_2内核版本,即CentOS 8.2
1 buffer_head
当进程直接读写块设备时,比如超级块和索引节点,就需要把块数据放入内存,我们上一篇讲page cache是将文件数据放入内存,因此不适用,这时就需要用到块缓冲区。每个块缓冲区都对应一个buffer_head类型的缓冲区首部描述符。数据依然存放在page页面中,只不过由buffer_head管理,这种情况下page页面被称为缓冲区页。
2 struct buffer_head主要成员变量
// Abridged definition: only the main members of struct buffer_head are listed.
// A buffer_head describes the mapping between an on-disk block and its in-memory buffer.
struct buffer_head {
unsigned long b_state; // state of the buffer
struct buffer_head *b_this_page;// next buffer belonging to the same page
struct page *b_page; // page in which this buffer resides
sector_t b_blocknr; // logical block number, relative to the start of the block device
size_t b_size; /* size of mapping */
char *b_data; // points at the data's location inside the page
struct block_device *b_bdev; // block device this buffer corresponds to
bh_end_io_t *b_end_io; /* I/O completion */
void *b_private; /* reserved for b_end_io */
struct list_head b_assoc_buffers; /* associated with another mapping */
struct address_space *b_assoc_map; /* mapping this buffer is associated with */
atomic_t b_count; /* users using this buffer_head */
};
可见,buffer_head描述的是磁盘block和内存buffer之间的映射关系。
3 创建buffer_head
缓冲区页的使用主要有以下两种场景,
- 直接读取块设备(super_block或inode)
- 读写的文件页在磁盘中不相邻,或者存在文件洞
3.1 直接读取块设备
我们先来看下第一种情况,我们以ext4文件系统的mkdir来大概梳理下流程,
ext4_mkdir ->
ext4_new_inode_start_handle ->
__ext4_new_inode -> ------------------------------------------- //为新目录分配inode索引
new_inode ->
ext4_read_inode_bitmap -> --------------------------------- //获取磁盘的inode位图
sb_getblk -> ------------------------------------------ //获取该磁盘块对应的buffer_head(找不到则新建)
__getblk_gfp ->
__find_get_block ->
lookup_bh_lru ----------------------------- //在每CPU变量bh_lrus中查找BH
__find_get_block_slow -> ------------------ //bh_lrus没找到就要到对应的page cache中查找页面
find_get_page_flags ->
pagecache_get_page -> ------------- //查找page cache
find_get_entry ---------------- //根据bdev->bd_inode->i_mapping地址空间在page cache基树中查找页面
page_buffers ------------------ //找到page cache看是否有对应的buffer_head,没有则返回NULL
bh_lru_install ---------------------------- //如果有找到,把找到的bh放入每CPU bh_lrus中,提高访问速度
__getblk_slow -> ------------------------------ //每CPU变量bh_lrus和page cache中都没有找到目标BH,就需要为该块新建页面和buffer_head
grow_buffers ->
grow_dev_page ->
find_or_create_page ->
pagecache_get_page -> --------- //根据bdev->bd_inode->i_mapping地址空间在page cache基树中查找页面
find_get_entry
__page_cache_alloc -------- //没找到page cache,创建新页面
add_to_page_cache_lru ----- //并加入page cache基树,以及LRU链表
alloc_page_buffers -> ------------- //找到或新建的页面没有buffer_head,创建新的buffer_head
alloc_buffer_head ->
kmem_cache_zalloc(bh_cachep //在slab中分配空闲buffer_head对象
set_bh_page ------------------- //设置buffer的数据指向地址
link_dev_buffers -> --------------- //将page对应的所有buffer_head连成一个环形链表
attach_page_buffers ----------- //将buffer关联到对应的page上,将page->private指向buffer_head
分配buffer_head的操作在alloc_page_buffers,前提是page cache没有对应的buffer_head。
- 分配以页大小为总量,每个块大小取决于blocksize大小
- 分配后将各个buffer_head用b_this_page串成链表
- 通过set_bh_page设置各个buffer_head的数据指向地址,该地址也就是page对应的虚拟地址,每个buffer_head还会保存相对页面的偏移地址。
// Abridged excerpt ("..." marks elided code).
// Allocates buffer_heads of `size` bytes each until one page is covered, chains
// them through b_this_page, and returns the head of that chain.
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry)
{
...
head = NULL;
offset = PAGE_SIZE;
// One page is the total budget; each buffer covers `size` bytes (the blocksize).
// E.g. with 4K pages: a 4K blocksize yields a single buffer_head,
// a 1K blocksize yields four buffer_heads.
// NOTE(review): the ">= 0" test only terminates if `offset` is a signed type;
// its declaration is elided from this excerpt - confirm.
while ((offset -= size) >= 0) {
// Allocate a free buffer_head structure from the slab cache.
bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
// When several buffer_heads are allocated, link them via b_this_page:
// the new one is pushed in front of the chain built so far.
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_size = size;
set_bh_page(bh, page, offset);// set the data address this buffer corresponds to
}
...
return head; // head of the chain, i.e. the buffer_head allocated last
...
}
// Associates a buffer_head with `page` and records where its data lives,
// `offset` bytes into that page.
void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset)
{
bh->b_page = page; // b_page points to the owning page
BUG_ON(offset >= PAGE_SIZE); // the offset must lie inside the page
if (PageHighMem(page))
// For a highmem page only the in-page offset is stored in b_data.
bh->b_data = (char *)(0 + offset);
else
bh->b_data = page_address(page) + offset; // page's address plus the in-page offset
}
alloc_page_buffers返回上层函数后,还需要link_dev_buffers进一步处理,
- 将buffer_head进一步头尾相连,变成环形链表
- 将buffer关联到对应的page上,page->private指向buffer_head头部
// Closes the NULL-terminated b_this_page chain starting at `head` into a
// circular list, then attaches that list to `page`.
static inline void link_dev_buffers(struct page *page, struct buffer_head *head)
{
struct buffer_head *bh, *tail;
// Walk the chain until b_this_page is NULL; `tail` ends up on the last buffer_head.
bh = head;
do {
tail = bh;
bh = bh->b_this_page;
} while (bh);
// Point the last element back at the head, making the list circular.
tail->b_this_page = head;
// Associate the buffers with the page.
attach_page_buffers(page, head);
}
// Attaches the buffer_head list `head` to `page` via the page's private field.
static inline void attach_page_buffers(struct page *page, struct buffer_head *head)
{
get_page(page); // presumably takes a page reference on behalf of the attached buffers - confirm
SetPagePrivate(page); // set the PG_Private flag: the page carries filesystem-private data, i.e. buffers
set_page_private(page, (unsigned long)head); // make page->private point to the buffer_head
}
3.2 读的文件页在磁盘中不相邻
在上篇文章——页缓存page cache和地址空间address_space中,我们知道如果page cache没有缓存,会调用readpage去磁盘读取数据,对于ext4调用的就是ext4_readpage,ext4_readpage其实是对ext4_mpage_readpages的简单封装,
// Abridged excerpt ("..." marks elided code).
// For each page to read, works out which blocks back the page; if the page
// already has buffer_heads or its blocks are not physically contiguous, it
// falls back to block_read_full_page() at the "confused" label.
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
struct inode *inode = mapping->host;
// log2 of the block size, taken from the inode (e.g. 9 for 512-byte blocks).
// NOTE(review): the original commentary equated this with the disk's physical
// sector size; i_blkbits is presumably the filesystem block size - confirm.
const unsigned blkbits = inode->i_blkbits;
// Number of blocks per page (e.g. 8 for 512-byte blocks with 4K pages).
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits; // block size in bytes
...
for (; nr_pages; nr_pages--) {
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
prefetchw(&page->flags);
// If the page already has buffer_heads attached, take the block-by-block path.
if (page_has_buffers(page))
goto confused;
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);// file-relative index of this page's first block
last_block = block_in_file + nr_pages * blocks_per_page; // end (exclusive) of the block range to read
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; // file size in blocks, rounded up
if (last_block > last_block_in_file)
last_block = last_block_in_file; // never read past the end of the file
page_block = 0;
...
// Use ext4_map_blocks to find all the disk blocks this page needs.
while (page_block < blocks_per_page) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file; // length to look up
// Look up the on-disk mapping for these blocks.
if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
...
}
}
...
// Check that this mapping starts right after the previously recorded
// physical block; if not, fall back to reading one block at a time.
// When the block size equals the page size there is one block per page,
// so page_block is 0 here and the adjacency question never arises.
if (page_block && blocks[page_block-1] != map.m_pblk-1)
goto confused;
// Record consecutive physical blocks until the mapping is used up
// or the page is fully covered.
for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
} else if (page_block == blocks_per_page)
break;
blocks[page_block] = map.m_pblk+relative_block;
page_block++;
block_in_file++;
}
}
...
confused:
...
if (!PageUptodate(page))
// Read the page through buffer_heads, one block at a time.
block_read_full_page(page, ext4_get_block);
...
}
...
return 0;
}
然后block_read_full_page就会调用create_empty_buffers创建buffer_head,和上面直接读取块设备一样,最后读取磁盘的文件数据。
3.3 写操作
对于ext4文件系统,写操作都会经过buffer_head,
SYSCALL_DEFINE3(write //write系统调用入口
ksys_write
vfs_write
__vfs_write
new_sync_write
call_write_iter
file->f_op->write_iter
ext4_file_write_iter
__generic_file_write_iter
generic_perform_write
a_ops->write_begin
ext4_write_begin
grab_cache_page_write_begin ->
pagecache_get_page ->
find_get_entry ---------------- //查找page cache
__page_cache_alloc ------------ //没找到page cache,则分配一个page对象
add_to_page_cache_lru --------- //将页面加入page cache基树中,同时也加入LRU链表
__block_write_begin
__block_write_begin_int
create_page_buffers
create_empty_buffers ------ //page没有对应的buffer,创建新的buffer
alloc_page_buffers
alloc_buffer_head
kmem_cache_zalloc(bh_cachep //从slab缓存中分配空闲buffer_head结构
set_bh_page ------- //设置buffer_head数据指向地址
attach_page_buffers
iov_iter_copy_from_user_atomic ---------------- //将数据从用户空间拷贝到内核空间,也就是page cache上
a_ops->write_end
ext4_write_end
block_write_end
__block_commit_write
mark_buffer_dirty
ext4_update_inode_size ---------------- //更新文件对应的inode信息
ext4_mark_inode_dirty ----------------- //标记inode为脏,写入了数据,需要同步到磁盘
4 删除buffer_head
因为buffer_head和page关联,因此在回收page的时候会同时回收buffer_head,同样在上篇文章——页缓存page cache和地址空间address_space中我们提到drop_caches时,针对ext4文件系统会调用ext4_releasepage对资源进行释放,
ext4_releasepage ->
jbd2_journal_try_to_free_buffers ->
try_to_free_buffers ->
drop_buffers ->
__clear_page_buffers //清空page->private的指向
free_buffer_head ->
kmem_cache_free(bh_cachep //将buffer_head释放回slab缓存
5 结构关系
以写/home/test.c文件为例,
- 应用程序打开test.c文件,并通过fd文件描述符获得file结构体
- file结构体中f_inode指向文件对应的inode结构
- file结构体中f_path的dentry指向文件对应的dentry对象
- dentry对象中的d_inode也指向该文件对应的inode结构
- file结构体的f_mapping指向inode的i_mapping
- inode的i_mapping则指向inode内嵌的address_space结构体
- 并且address_space结构体也有一个变量——host,指向对应的inode结构
- address_space结构体的i_pages指向page cache基树
- page cache基树的结点中slot指针数组指向page对象
- 此时count=2,即说明test.c文件在内存中有两个高速缓存页,page地址保存在slot指针数组中
- page结构体中的mapping指向该文件的address_space,也就是file结构体的f_mapping所指向的同一个对象
- page结构体中的private指向buffer_head的首部
- buffer_head的数量取决于块大小(blocksize),例如blocksize为512字节、页大小为4K时,就会有8个buffer_head(page size/block size)
- 所有buffer_head通过b_this_page连接,并形成一个环形链表
- buffer_head结构体中b_page指向buffer所在的page结构体,b_data则指向buffer数据在页面中的位置,即虚拟地址