1. The __block_write_full_page function in fs/buffer.c of the Linux kernel
static int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
This function performs a full-page write for a block-device-backed mapping: it maps the page's dirty buffers to disk blocks, marks them for asynchronous write-out, and submits the I/O. In newer kernel versions (the 6.x series) much of this code has been adapted to the folio mechanism (an extension of struct page), but the core structure is similar.
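For reference, the folio-based counterpart in the 6.x sources looks along these lines (signature quoted from memory of roughly the 6.5/6.6 era; later releases reportedly drop the handler parameter, so treat the exact form as version-dependent and check your tree):

int __block_write_full_folio(struct inode *inode, struct folio *folio,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler);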
The snippet

        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        ...
                }
                tmp = tmp->b_this_page;
        }

actually lives in end_buffer_async_write, the I/O completion handler invoked for this function, where it checks whether the other buffers on the page are still under asynchronous write (to avoid a race when ending writeback). It is a circular linked-list traversal: starting from the buffer after the current bh, it walks until it returns to the starting point, which amounts to checking "every buffer on the page except the current bh".
The function block_write_full_page calls __block_write_full_page and passes end_buffer_async_write as the handler.
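For context, that caller looks roughly like this in kernels of the same era (abridged from fs/buffer.c; the tail-page zeroing here is what the "buffer was zeroed by block_write_full_page()" comment in the listing below refers to):

int block_write_full_page(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc)
{
        struct inode * const inode = page->mapping->host;
        loff_t i_size = i_size_read(inode);
        const pgoff_t end_index = i_size >> PAGE_SHIFT;
        unsigned offset;

        /* Is the page fully inside i_size? */
        if (page->index < end_index)
                return __block_write_full_page(inode, page, get_block, wbc,
                                               end_buffer_async_write);

        /* Is the page fully outside i_size? (truncate in progress) */
        offset = i_size & (PAGE_SIZE - 1);
        if (page->index >= end_index + 1 || !offset) {
                unlock_page(page);
                return 0;
        }

        /* The page straddles i_size: zero the part beyond EOF, since the
         * page may be mmapped and stale data must not reach the disk. */
        zero_user_segment(page, offset, PAGE_SIZE);
        return __block_write_full_page(inode, page, get_block, wbc,
                                       end_buffer_async_write);
}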
static int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        unsigned int blocksize, bbits;
        int nr_underway = 0;
        int write_flags = wbc_to_write_flags(wbc);

        head = create_page_buffers(page, inode, 0);
        if (!head)
                return 0;

        /*
         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the page stays dirty.
         *
         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
         * handle that here by just cleaning them.
         */

        bh = head;
        blocksize = bh->b_size;
        bbits = block_size_bits(blocksize);

        block = (sector_t)page->index << (PAGE_SHIFT - bbits);
        last_block = (i_size_read(inode) - 1) >> bbits;

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this page can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_page()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, WRITE);
                        if (err)
                                goto recover;
                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                unmap_underlying_metadata(bh->b_bdev,
                                                          bh->b_blocknr);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the page.  Note that this can
                 * potentially cause a busy-wait loop from writeback threads
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
                        continue;
                }
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The page and its buffers are protected by PageWriteback(), so we can
         * drop the bh refcounts early.
         */
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);

        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);

        err = 0;
done:
        if (nr_underway == 0) {
                /*
                 * The page was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * ll_rw_block/submit_bh.  A rare case.
                 */
                end_page_writeback(page);

                /*
                 * The page and buffer_heads can be released at any time from
                 * here on.
                 */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The page is currently locked and not marked for writeback.
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh) &&
                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty page.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        SetPageError(page);
        BUG_ON(PageWriteback(page));
        mapping_set_error(page->mapping, err);
        set_page_writeback(page);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);
        goto done;
}
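As a concrete caller one level further up, the block-device mapping itself wires its ->writepage through this path (from fs/block_dev.c of the same era; blkdev_get_block is the block device's trivial get_block implementation):

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, blkdev_get_block, wbc);
}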
In the handler passed to __block_write_full_page, end_buffer_async_write (also defined in fs/buffer.c), the I/O completion callback uses a loop in the spirit of while (p->next) to check whether any other buffer on the page is still busy (the still_busy path). This ensures that writeback on the page is ended only after every buffer has completed its write.
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                buffer_io_error(bh, ", lost async page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        tmp = bh->b_this_page;          /* start from the next bh on the page */
        while (tmp != bh) {             /* circular walk until we return to bh,
                                         * i.e. check all buffers except this one */
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;        /* another buffer is still being
                                                 * written asynchronously */
                }
                tmp = tmp->b_this_page;
        }
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        end_page_writeback(page);
        return;

still_busy:
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}
Loop explanation:
- b_this_page links a page's buffer_heads into a circular singly linked list: the last buffer's b_this_page points back to the first.
- The loop starts at bh->b_this_page, and the condition while (tmp != bh) guarantees that the current bh itself is never re-examined; only its "sibling" buffers are checked.
- It plays the role that while (tmp != NULL) plays on a linear list, adapted to the circular structure so that the walk terminates instead of looping forever.
- If some other buffer is still under asynchronous write (buffer_async_write(tmp)), control jumps to still_busy and ending the page's writeback is deferred.
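To make the pattern concrete, here is a minimal, self-contained user-space sketch of the same "check every sibling except the starting node" walk; struct node and its busy flag are hypothetical stand-ins for buffer_head and its async-write bit, not kernel code:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for buffer_head on a page. */
struct node {
        struct node *next;      /* plays the role of b_this_page */
        bool busy;              /* plays the role of the async-write flag */
};

/* Returns true if any node other than 'self' is still busy,
 * mirroring the still_busy check in end_buffer_async_write(). */
static bool sibling_busy(struct node *self)
{
        struct node *tmp = self->next;  /* start at the next node */

        while (tmp != self) {           /* stop when we come back around */
                if (tmp->busy)
                        return true;
                tmp = tmp->next;
        }
        return false;                   /* every sibling has finished */
}

int main(void)
{
        struct node a, b, c;

        /* a -> b -> c -> a : the last node points back to the first. */
        a.next = &b; b.next = &c; c.next = &a;
        a.busy = false; b.busy = true; c.busy = false;

        printf("%s\n", sibling_busy(&a) ? "still busy" : "all done"); /* still busy */
        b.busy = false;
        printf("%s\n", sibling_busy(&a) ? "still busy" : "all done"); /* all done */
        return 0;
}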
2. The ip_frag_reasm function in net/ipv4/ip_fragment.c (from older kernel versions, e.g. the 3.7 series)
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
                         struct net_device *dev)
{
        struct sk_buff *fp;
        ...
        /* the "for" is equivalent to while (fp->next != NULL) */
        for (fp = prev->next; fp != NULL; fp = fp->next) {
                ...
        }
}
This snippet matches the function's signature and internal logic (including the loop that walks the sk_buff chain), but current kernels (the 6.x series) have refactored the function, changing both its parameters and its implementation.
Below is the complete source of the function from Linux kernel 3.7 (line numbers removed, original C formatting preserved). Note that the loop there is written for (fp = head->next; fp;), similar to the snippet above (in context, head ends up being the result of the prev handling at the top of the function).
The comment "the for is equivalent to while (fp->next != NULL)" is not quite accurate: the for loop also processes the last non-NULL fp, whereas while (fp->next != NULL) would skip the final fragment.
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
                         struct net_device *dev)
{
        struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
        struct iphdr *iph;
        struct sk_buff *fp, *head = qp->q.fragments;
        int len;
        int ihlen;
        int err;
        int sum_truesize;
        u8 ecn;

        ipq_kill(qp);

        ecn = ip4_frag_ecn_table[qp->ecn];
        if (unlikely(ecn == 0xff)) {
                err = -EINVAL;
                goto out_fail;
        }
        /* Make the one we just received the head. */
        if (prev) {
                head = prev->next;
                fp = skb_clone(head, GFP_ATOMIC);
                if (!fp)
                        goto out_nomem;

                fp->next = head->next;
                if (!fp->next)
                        qp->q.fragments_tail = fp;
                prev->next = fp;

                skb_morph(head, qp->q.fragments);
                head->next = qp->q.fragments->next;

                consume_skb(qp->q.fragments);
                qp->q.fragments = head;
        }

        WARN_ON(head == NULL);
        WARN_ON(FRAG_CB(head)->offset != 0);

        /* Allocate a new buffer for the datagram. */
        ihlen = ip_hdrlen(head);
        len = ihlen + qp->q.len;

        err = -E2BIG;
        if (len > 65535)
                goto out_oversize;

        /* Head of list must not be cloned. */
        if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
                goto out_nomem;

        /* If the first fragment is fragmented itself, we split
         * it to two chunks: the first with data and paged part
         * and the second, holding only fragments. */
        if (skb_has_frag_list(head)) {
                struct sk_buff *clone;
                int i, plen = 0;

                if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
                        goto out_nomem;
                clone->next = head->next;
                head->next = clone;
                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                skb_frag_list_init(head);
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->len = clone->data_len = head->data_len - plen;
                head->data_len -= clone->len;
                head->len -= clone->len;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                atomic_add(clone->truesize, &qp->q.net->mem);
        }

        skb_push(head, head->data - skb_network_header(head));

        sum_truesize = head->truesize;
        /* equivalent to: fp = head->next; while (fp != NULL) { ...; fp = next; } */
        for (fp = head->next; fp;) {
                bool headstolen;
                int delta;
                struct sk_buff *next = fp->next;

                sum_truesize += fp->truesize;
                if (head->ip_summed != fp->ip_summed)
                        head->ip_summed = CHECKSUM_NONE;
                else if (head->ip_summed == CHECKSUM_COMPLETE)
                        head->csum = csum_add(head->csum, fp->csum);

                if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
                        kfree_skb_partial(fp, headstolen);
                } else {
                        if (!skb_shinfo(head)->frag_list)
                                skb_shinfo(head)->frag_list = fp;
                        head->data_len += fp->len;
                        head->len += fp->len;
                        head->truesize += fp->truesize;
                }
                fp = next;
        }
        atomic_sub(sum_truesize, &qp->q.net->mem);

        head->next = NULL;
        head->dev = dev;
        head->tstamp = qp->q.stamp;
        IPCB(head)->frag_max_size = qp->q.max_size;

        iph = ip_hdr(head);
        /* max_size != 0 implies at least one fragment had IP_DF set */
        iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
        iph->tot_len = htons(len);
        iph->tos |= ecn;
        qp->q.fragments = NULL;
        qp->q.fragments_tail = NULL;
        return 0;

out_nomem:
        LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
                       qp);
        err = -ENOMEM;
        goto out_fail;
out_oversize:
        net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
        return err;
}
Notes
- Location: net/ipv4/ip_fragment.c; the function performs IP fragment reassembly.
- Version differences: in newer kernels the function was refactored to manage the fragment queue with a red-black tree (the rework landed around v4.19 and also reached older stable series, such as 4.4/4.9, via backports); the parameters became ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs), and the list-walking logic moved into the generic fragment-handling code.
- Loop analysis: the for loop in the opening snippet walks the subsequent sk_buff fragments and coalesces them into head. An equivalent while form is fp = head->next; while (fp != NULL) { ... fp = fp->next; }, not while (fp->next != NULL), which would miss the last fragment; see the sketch below.
- Source: Linux kernel 3.7.