1. The __block_write_full_page function in fs/buffer.c of the Linux kernel
static int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
This function performs a full-page write for a block-device-backed mapping: it maps the page's dirty buffers to disk blocks, marks them for asynchronous write-out, and submits the I/O. In newer kernel versions (the 6.x series) much of this code has been adapted to the folio mechanism (an extension of struct page), but the core structure is similar.
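For reference, the folio-based counterpart in the 6.x sources looks along these lines (signature quoted from memory of roughly the 6.5/6.6 era; later releases reportedly drop the handler parameter, so treat the exact form as version-dependent and check your tree):

int __block_write_full_folio(struct inode *inode, struct folio *folio,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler);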
The snippet

        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        ...
                }
                tmp = tmp->b_this_page;
        }

actually lives in end_buffer_async_write, the I/O completion handler invoked for this function, where it checks whether the other buffers on the page are still under asynchronous write (to avoid a race when ending writeback). It is a circular linked-list traversal: starting from the buffer after the current bh, it walks until it returns to the starting point, which amounts to checking "every buffer on the page except the current bh".
The function block_write_full_page calls __block_write_full_page and passes end_buffer_async_write as the handler.
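For context, that caller looks roughly like this in kernels of the same era (abridged from fs/buffer.c; the tail-page zeroing here is what the "buffer was zeroed by block_write_full_page()" comment in the listing below refers to):

int block_write_full_page(struct page *page, get_block_t *get_block,
                        struct writeback_control *wbc)
{
        struct inode * const inode = page->mapping->host;
        loff_t i_size = i_size_read(inode);
        const pgoff_t end_index = i_size >> PAGE_SHIFT;
        unsigned offset;

        /* Is the page fully inside i_size? */
        if (page->index < end_index)
                return __block_write_full_page(inode, page, get_block, wbc,
                                               end_buffer_async_write);

        /* Is the page fully outside i_size? (truncate in progress) */
        offset = i_size & (PAGE_SIZE - 1);
        if (page->index >= end_index + 1 || !offset) {
                unlock_page(page);
                return 0;
        }

        /* The page straddles i_size: zero the part beyond EOF, since the
         * page may be mmapped and stale data must not reach the disk. */
        zero_user_segment(page, offset, PAGE_SIZE);
        return __block_write_full_page(inode, page, get_block, wbc,
                                       end_buffer_async_write);
}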
static int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
                        bh_end_io_t *handler)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        unsigned int blocksize, bbits;
        int nr_underway = 0;
        int write_flags = wbc_to_write_flags(wbc);

        head = create_page_buffers(page, inode, 0);
        if (!head)
                return 0;

        /*
         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the page stays dirty.
         *
         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
         * handle that here by just cleaning them.
         */

        bh = head;
        blocksize = bh->b_size;
        bbits = block_size_bits(blocksize);

        block = (sector_t)page->index << (PAGE_SHIFT - bbits);
        last_block = (i_size_read(inode) - 1) >> bbits;

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this page can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_page()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, WRITE);
                        if (err)
                                goto recover;
                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                unmap_underlying_metadata(bh->b_bdev,
                                                          bh->b_blocknr);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the page.  Note that this can
                 * potentially cause a busy-wait loop from writeback threads
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        redirty_page_for_writepage(wbc, page);
                        continue;
                }
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The page and its buffers are protected by PageWriteback(), so we can
         * drop the bh refcounts early.
         */
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);

        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);

        err = 0;
done:
        if (nr_underway == 0) {
                /*
                 * The page was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * ll_rw_block/submit_bh.  A rare case.
                 */
                end_page_writeback(page);

                /*
                 * The page and buffer_heads can be released at any time from
                 * here on.
                 */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The page is currently locked and not marked for writeback.
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh) &&
                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write_endio(bh, handler);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty page.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        SetPageError(page);
        BUG_ON(PageWriteback(page));
        mapping_set_error(page->mapping, err);
        set_page_writeback(page);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);
        goto done;
}
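As a concrete caller one level further up, the block-device mapping itself wires its ->writepage through this path (from fs/block_dev.c of the same era; blkdev_get_block is the block device's trivial get_block implementation):

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, blkdev_get_block, wbc);
}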
In the handler passed to __block_write_full_page, end_buffer_async_write (also defined in fs/buffer.c), the I/O completion callback uses a loop in the spirit of while (p->next) to check whether any other buffer on the page is still busy (the still_busy path). This ensures that writeback on the page is ended only after every buffer has completed its write.
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                buffer_io_error(bh, ", lost async page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        local_irq_save(flags);
        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        tmp = bh->b_this_page;          /* start from the next bh on the page */
        while (tmp != bh) {             /* circular walk until we return to bh,
                                         * i.e. check all buffers except this one */
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;        /* another buffer is still being
                                                 * written asynchronously */
                }
                tmp = tmp->b_this_page;
        }
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        end_page_writeback(page);
        return;

still_busy:
        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
        local_irq_restore(flags);
        return;
}
Loop explanation:
- b_this_page links a page's buffer_heads into a circular singly linked list: the last buffer's b_this_page points back to the first.
- The loop starts at bh->b_this_page, and the condition while (tmp != bh) guarantees that the current bh itself is never re-examined; only its "sibling" buffers are checked.
- It plays the role that while (tmp != NULL) plays on a linear list, adapted to the circular structure so that the walk terminates instead of looping forever.
- If some other buffer is still under asynchronous write (buffer_async_write(tmp)), control jumps to still_busy and ending the page's writeback is deferred.
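To make the pattern concrete, here is a minimal, self-contained user-space sketch of the same "check every sibling except the starting node" walk; struct node and its busy flag are hypothetical stand-ins for buffer_head and its async-write bit, not kernel code:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for buffer_head on a page. */
struct node {
        struct node *next;      /* plays the role of b_this_page */
        bool busy;              /* plays the role of the async-write flag */
};

/* Returns true if any node other than 'self' is still busy,
 * mirroring the still_busy check in end_buffer_async_write(). */
static bool sibling_busy(struct node *self)
{
        struct node *tmp = self->next;  /* start at the next node */

        while (tmp != self) {           /* stop when we come back around */
                if (tmp->busy)
                        return true;
                tmp = tmp->next;
        }
        return false;                   /* every sibling has finished */
}

int main(void)
{
        struct node a, b, c;

        /* a -> b -> c -> a : the last node points back to the first. */
        a.next = &b; b.next = &c; c.next = &a;
        a.busy = false; b.busy = true; c.busy = false;

        printf("%s\n", sibling_busy(&a) ? "still busy" : "all done"); /* still busy */
        b.busy = false;
        printf("%s\n", sibling_busy(&a) ? "still busy" : "all done"); /* all done */
        return 0;
}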
2. The ip_frag_reasm function in net/ipv4/ip_fragment.c (from older kernel versions, e.g. the 3.7 series)
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
                         struct net_device *dev)
{
        struct sk_buff *fp;
        ...
        /* the "for" is equivalent to while (fp->next != NULL) */
        for (fp = prev->next; fp != NULL; fp = fp->next) {
                ...
        }
}
This snippet matches the function's signature and internal logic (including the loop that walks the sk_buff chain), but current kernels (the 6.x series) have refactored the function, changing both its parameters and its implementation.
Below is the complete source of the function from Linux kernel 3.7 (line numbers removed, original C formatting preserved). Note that the loop there is written for (fp = head->next; fp;), similar to the snippet above (in context, head ends up being the result of the prev handling at the top of the function).
The comment "the for is equivalent to while (fp->next != NULL)" is not quite accurate: the for loop also processes the last non-NULL fp, whereas while (fp->next != NULL) would skip the final fragment.
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
                         struct net_device *dev)
{
        struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
        struct iphdr *iph;
        struct sk_buff *fp, *head = qp->q.fragments;
        int len;
        int ihlen;
        int err;
        int sum_truesize;
        u8 ecn;

        ipq_kill(qp);

        ecn = ip4_frag_ecn_table[qp->ecn];
        if (unlikely(ecn == 0xff)) {
                err = -EINVAL;
                goto out_fail;
        }
        /* Make the one we just received the head. */
        if (prev) {
                head = prev->next;
                fp = skb_clone(head, GFP_ATOMIC);
                if (!fp)
                        goto out_nomem;

                fp->next = head->next;
                if (!fp->next)
                        qp->q.fragments_tail = fp;
                prev->next = fp;

                skb_morph(head, qp->q.fragments);
                head->next = qp->q.fragments->next;

                consume_skb(qp->q.fragments);
                qp->q.fragments = head;
        }

        WARN_ON(head == NULL);
        WARN_ON(FRAG_CB(head)->offset != 0);

        /* Allocate a new buffer for the datagram. */
        ihlen = ip_hdrlen(head);
        len = ihlen + qp->q.len;

        err = -E2BIG;
        if (len > 65535)
                goto out_oversize;

        /* Head of list must not be cloned. */
        if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
                goto out_nomem;

        /* If the first fragment is fragmented itself, we split
         * it to two chunks: the first with data and paged part
         * and the second, holding only fragments. */
        if (skb_has_frag_list(head)) {
                struct sk_buff *clone;
                int i, plen = 0;

                if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
                        goto out_nomem;
                clone->next = head->next;
                head->next = clone;
                skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                skb_frag_list_init(head);
                for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                        plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                clone->len = clone->data_len = head->data_len - plen;
                head->data_len -= clone->len;
                head->len -= clone->len;
                clone->csum = 0;
                clone->ip_summed = head->ip_summed;
                atomic_add(clone->truesize, &qp->q.net->mem);
        }

        skb_push(head, head->data - skb_network_header(head));

        sum_truesize = head->truesize;
        /* equivalent to: fp = head->next; while (fp != NULL) { ...; fp = next; } */
        for (fp = head->next; fp;) {
                bool headstolen;
                int delta;
                struct sk_buff *next = fp->next;

                sum_truesize += fp->truesize;
                if (head->ip_summed != fp->ip_summed)
                        head->ip_summed = CHECKSUM_NONE;
                else if (head->ip_summed == CHECKSUM_COMPLETE)
                        head->csum = csum_add(head->csum, fp->csum);

                if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
                        kfree_skb_partial(fp, headstolen);
                } else {
                        if (!skb_shinfo(head)->frag_list)
                                skb_shinfo(head)->frag_list = fp;
                        head->data_len += fp->len;
                        head->len += fp->len;
                        head->truesize += fp->truesize;
                }
                fp = next;
        }
        atomic_sub(sum_truesize, &qp->q.net->mem);

        head->next = NULL;
        head->dev = dev;
        head->tstamp = qp->q.stamp;
        IPCB(head)->frag_max_size = qp->q.max_size;

        iph = ip_hdr(head);
        /* max_size != 0 implies at least one fragment had IP_DF set */
        iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
        iph->tot_len = htons(len);
        iph->tos |= ecn;
        qp->q.fragments = NULL;
        qp->q.fragments_tail = NULL;
        return 0;

out_nomem:
        LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
                       qp);
        err = -ENOMEM;
        goto out_fail;
out_oversize:
        net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
        IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
        return err;
}
Notes
- Location: net/ipv4/ip_fragment.c; the function performs IP fragment reassembly.
- Version differences: in newer kernels the function was refactored to manage the fragment queue with a red-black tree (the rework landed around v4.19 and also reached older stable series, such as 4.4/4.9, via backports); the parameters became ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev, int *refs), and the list-walking logic moved into the generic fragment-handling code.
- Loop analysis: the for loop in the opening snippet walks the subsequent sk_buff fragments and coalesces them into head. An equivalent while form is fp = head->next; while (fp != NULL) { ... fp = fp->next; }, not while (fp->next != NULL), which would miss the last fragment; see the sketch below.
- Source: Linux kernel 3.7.