ext4 buddy块分配算法源码剖析

原创已于 2023-07-23 23:07:56 修改 · 416 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#linux #linux操作系统 #内核 #文件系统

于 2023-07-23 23:07:38 首次发布

文件系统与存储专栏收录该内容

19 篇文章

订阅专栏

文章详细介绍了ext4文件系统中用于块分配的ext4_mb_regular_allocator函数，该函数首先尝试从目标块分配，然后按照不同严格程度（cr值）搜索空闲块，考虑了2的幂次方分配和RAID条带化的优化。在找不到满足条件的连续空闲块时，会尝试找到最佳的空闲块进行分配，以避免长时间搜索导致的性能损失。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

概述

ext4 buddy块分配算法的入口函数是ext4_mb_regular_allocator。阅读本文之前，建议先阅读《ext4 mballoc之buddy算法》（nginux的博客，CSDN博客）。

ext4_mb_regular_allocator源码


/*
 * ext4_mb_regular_allocator() - scan block groups looking for free space
 * that satisfies the allocation context @ac.
 *
 * Order of attempts, as visible in this excerpt:
 *   1. ext4_mb_find_by_goal(): try the goal extent in ac->ac_g_ex.
 *   2. Loop over criteria cr (starting at 0 or 1, up to 3) and, for each,
 *      over up to ngroups block groups starting at ac->ac_g_ex.fe_group,
 *      scanning every group that ext4_mb_good_group*() accepts.
 *   3. If a candidate was recorded in ac->ac_b_ex but never committed,
 *      call ext4_mb_try_best_found(); if that still does not reach
 *      AC_STATUS_FOUND, reset the candidate and rescan once with cr = 3
 *      and EXT4_MB_HINT_FIRST set.
 *
 * The outcome of the search is reported through ac->ac_status. The return
 * value only carries errors: the error from ext4_mb_find_by_goal() or
 * ext4_mb_load_buddy(), or, when nothing was found, the first non-zero
 * value recorded from a rejecting ext4_mb_good_group_nolock() call;
 * otherwise 0.
 *
 * NOTE(review): the "..." line inside the group loop marks code elided by
 * the article. In the visible code nr is never changed from 0 and
 * prefetch_ios is unused; presumably the elided part maintains the
 * prefetch state - confirm against the full source.
 */
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
	ext4_group_t prefetch_grp = 0, ngroups, group, i;
	int cr = -1;
	int err = 0, first_err = 0;
	unsigned int nr = 0, prefetch_ios = 0;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	struct ext4_buddy e4b;
	int lost;

	sb = ac->ac_sb;
	sbi = EXT4_SB(sb);
	ngroups = ext4_get_groups_count(sb);
	/* non-extent files are limited to low blocks/groups */
	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
		ngroups = sbi->s_blockfile_groups;

	BUG_ON(ac->ac_status == AC_STATUS_FOUND);

	/* first, try the goal */
	/*
	 * Try to allocate at the goal first. On error, or if the goal
	 * attempt already produced AC_STATUS_FOUND, go straight to out.
	 */
	err = ext4_mb_find_by_goal(ac, &e4b);
	if (err || ac->ac_status == AC_STATUS_FOUND)
		goto out;

	/* Caller accepts only the goal: do not scan any other group. */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		goto out;

	/*
	 * ac->ac_2order is set only if the fe_len is a power of 2
	 * if ac->ac_2order is set we also set criteria to 0 so that we
	 * try exact allocation using buddy.
	 */
	i = fls(ac->ac_g_ex.fe_len);
	ac->ac_2order = 0;
	/*
	 * We search using buddy data only if the order of the request
	 * is greater than equal to the sbi_s_mb_order2_reqs
	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
	 * We also support searching for power-of-two requests only for
	 * requests upto maximum buddy size we have constructed.
	 */
	if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
		/*
		 * This should tell if fe_len is exactly power of 2
		 */
		/*
		 * The requested length is exactly 2^N: e.g. for a request
		 * of 1024 blocks, fls() gives 11 and ac->ac_2order = 10.
		 */
		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
			ac->ac_2order = array_index_nospec(i - 1,
							   sb->s_blocksize_bits + 2);
	}

	/* if stream allocation is enabled, use global goal */
	/*
	 * The goal could not be used, so restart the search from the
	 * group/offset saved in sbi->s_mb_last_group / s_mb_last_start
	 * (presumably where the filesystem last allocated - confirm).
	 */
	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
		/* TBD: may be hot point */
		spin_lock(&sbi->s_md_lock);
		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
		spin_unlock(&sbi->s_md_lock);
	}

	/* Let's just scan groups to find more-less suitable blocks */
	cr = ac->ac_2order ? 0 : 1;
	/*
	 * cr is the strictness of the search. A power-of-two request
	 * starts at cr = 0 (exact allocation of the requested length);
	 * every other request starts at cr = 1.
	 */
	/*
	 * cr == 0 try to get exact allocation,
	 * cr == 3  try to get anything
	 */
repeat:
	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
		ac->ac_criteria = cr;
		/*
		 * searching for the right group start
		 * from the goal value specified
		 */
		group = ac->ac_g_ex.fe_group;
		prefetch_grp = group;

		/* Visit at most ngroups groups, wrapping around to group 0. */
		for (i = 0; i < ngroups; group++, i++) {
			int ret = 0;
			cond_resched();
			/*
			 * Artificially restricted ngroups for non-extent
			 * files makes group > ngroups possible on first loop.
			 */
			if (group >= ngroups)
				group = 0;

	        ...

			/* This now checks without needing the buddy page */
			ret = ext4_mb_good_group_nolock(ac, group, cr);
			if (ret <= 0) {
				/* Remember the first non-zero result for the out: path. */
				if (!first_err)
					first_err = ret;
				continue;
			}

			err = ext4_mb_load_buddy(sb, group, &e4b);
			if (err)
				goto out;

			ext4_lock_group(sb, group);

			/*
			 * We need to check again after locking the
			 * block group
			 */
			ret = ext4_mb_good_group(ac, group, cr);
			if (ret == 0) {
				ext4_unlock_group(sb, group);
				ext4_mb_unload_buddy(&e4b);
				continue;
			}

			/* A good group was found: scan it for free space. */
			ac->ac_groups_scanned++;
			if (cr == 0)
				/* The requested length is exactly a power of two. */
				ext4_mb_simple_scan_group(ac, &e4b);
			else if (cr == 1 && sbi->s_stripe &&
					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
				/*
				 * The requested length is a whole multiple of
				 * the stripe size: RAID optimisation.
				 */
				ext4_mb_scan_aligned(ac, &e4b);
			else
				/*
				 * Walk the free extents of the group and pick
				 * the most suitable one.
				 */
				ext4_mb_complex_scan_group(ac, &e4b);

			ext4_unlock_group(sb, group);
			ext4_mb_unload_buddy(&e4b);

			if (ac->ac_status != AC_STATUS_CONTINUE)
				break;
		}
	}

	/*
	 * The scan is over. If a candidate extent was recorded
	 * (ac_b_ex.fe_len > 0) but the status never reached
	 * AC_STATUS_FOUND, and this is not already the HINT_FIRST retry,
	 * fall back to that candidate.
	 */
	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
		/*
		 * We've been searching too long. Let's try to allocate
		 * the best chunk we've found so far
		 */
		/* Any free space will do at this point. */
		ext4_mb_try_best_found(ac, &e4b);

		/*
		 * Still not found: walk all block groups once more, this
		 * time accepting the first free space encountered.
		 */
		if (ac->ac_status != AC_STATUS_FOUND) {
			/*
			 * Someone more lucky has already allocated it.
			 * The only thing we can do is just take first
			 * found block(s)
			 */
			lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
			mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
				 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
				 ac->ac_b_ex.fe_len, lost);

			/*
			 * Forget the lost candidate and restart the loop at
			 * the loosest criterion. EXT4_MB_HINT_FIRST also
			 * keeps this fallback block from running twice.
			 */
			ac->ac_b_ex.fe_group = 0;
			ac->ac_b_ex.fe_start = 0;
			ac->ac_b_ex.fe_len = 0;
			ac->ac_status = AC_STATUS_CONTINUE;
			ac->ac_flags |= EXT4_MB_HINT_FIRST;
			cr = 3;
			goto repeat;
		}
	}
out:
	/* Nothing found and no hard error: report the first recorded one. */
	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
		err = first_err;

	mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
		 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
		 ac->ac_flags, cr, err);

	if (nr)
		ext4_mb_prefetch_fini(sb, prefetch_grp, nr);

	return err;
}

ext4_mb_find_by_goal


/*
 * ext4_mb_find_by_goal() - try to allocate exactly at the goal extent.
 * @ac:  allocation context; the goal is ac->ac_g_ex (group, start, len).
 * @e4b: buddy handle, loaded and unloaded inside this function.
 *
 * Does nothing unless EXT4_MB_HINT_TRY_GOAL is set and the goal group has
 * free blocks. Otherwise it measures the free extent starting at the goal
 * block and, if acceptable, records it in ac->ac_b_ex and hands it to
 * ext4_mb_use_best_found().
 *
 * Return: the error from ext4_mb_load_buddy() if loading fails, else 0.
 * A return of 0 does not mean the goal was taken; callers inspect
 * ac->ac_status for that.
 */
static noinline_for_stack
int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
				struct ext4_buddy *e4b)
{
	ext4_group_t group = ac->ac_g_ex.fe_group;
	int max;
	int err;
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
	struct ext4_free_extent ex;

	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
		return 0;
	if (grp->bb_free == 0)
		return 0;

	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
	if (err)
		return err;

	/* A corrupt block bitmap is not an error here: just skip the goal. */
	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
		ext4_mb_unload_buddy(e4b);
		return 0;
	}

	ext4_lock_group(ac->ac_sb, group);
	/*
	 * Using the buddy bitmap, measure the free extent that starts at
	 * the goal block fe_start. We want at least fe_len blocks, but a
	 * free run that long may not exist, so max can be smaller.
	 */
	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
			     ac->ac_g_ex.fe_len, &ex);
	ex.fe_logical = 0xDEADFA11; /* debug value */

	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
		/*
		 * RAID optimisation: the request is exactly one stripe long,
		 * so take the extent only if its absolute start block is a
		 * multiple of the stripe size. If it is not, nothing is
		 * allocated here.
		 */
		ext4_fsblk_t start;

		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
			ex.fe_start;
		/* use do_div to get remainder (would be 64-bit modulo) */
		if (do_div(start, sbi->s_stripe) == 0) {
			ac->ac_found++;
			ac->ac_b_ex = ex;
			ext4_mb_use_best_found(ac, e4b);
		}
	} else if (max >= ac->ac_g_ex.fe_len) {
		/* The goal extent is free for the whole requested length. */
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_found++;
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
		/* Sometimes, caller may want to merge even small
		 * number of blocks to an existing extent */
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_found++;
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	}
	ext4_unlock_group(ac->ac_sb, group);
	ext4_mb_unload_buddy(e4b);

	return 0;
}

mb_find_extent源码


/*
 * mb_find_extent() - measure the free extent that starts at @block.
 * @e4b:    buddy data of the group; its group lock must be held (asserted).
 * @block:  block offset inside the group where the extent must start.
 * @needed: number of blocks wanted; the extension loop stops once the
 *          extent is at least this long.
 * @ex:     output; receives group, start and length of the extent found.
 *
 * If @block is already in use, @ex is zeroed and 0 is returned. Otherwise
 * the extent begins exactly at @block and is grown by appending the
 * following free buddy chunks.
 *
 * Return: ex->fe_len. NOTE(review): the article elided code just before
 * the return ("..."), which may adjust the result further - confirm
 * against the full source.
 */
static int mb_find_extent(struct ext4_buddy *e4b, int block,
				int needed, struct ext4_free_extent *ex)
{
	int next = block;	/* keeps the caller's original start block */
	int max, order;
	void *buddy;

	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	BUG_ON(ex == NULL);

	buddy = mb_find_buddy(e4b, 0, &max);
	BUG_ON(buddy == NULL);
	BUG_ON(block >= max);
	/*
	 * Check the order-0 bitmap first: if the block is already
	 * allocated there is no extent to report.
	 */
	if (mb_test_bit(block, buddy)) {
		ex->fe_len = 0;
		ex->fe_start = 0;
		ex->fe_group = 0;
		return 0;
	}

	/* find actual order */
	/* order: buddy order of the free chunk that contains block. */
	order = mb_find_order_for_block(e4b, block);
	block = block >> order;

	/* Begin with the whole aligned 2^order chunk containing the block. */
	ex->fe_len = 1 << order;
	ex->fe_start = block << order;
	ex->fe_group = e4b->bd_group;

	/* calc difference from given start */
	/* Trim the part of the chunk that lies before the requested block. */
	next = next - ex->fe_start;
	ex->fe_len -= next;
	ex->fe_start += next;

	/* Extend with following free chunks until @needed is covered. */
	while (needed > ex->fe_len &&
	       mb_find_buddy(e4b, order, &max)) {

		/* No further chunk of this order exists in the group. */
		if (block + 1 >= max)
			break;

		/* First block of the chunk right after the current one. */
		next = (block + 1) * (1 << order);
		/* That block is in use: the free run ends here. */
		if (mb_test_bit(next, e4b->bd_bitmap))
			break;

		order = mb_find_order_for_block(e4b, next);

		block = next >> order;
		ex->fe_len += 1 << order;
	}

    ...
	return ex->fe_len;
}

函数返回的ex->fe_len是从block物理块号开始的连续空闲块长度：它可能>=needed参数代表的请求数量，也可能小于needed，即从goal目标物理块block开始的连续空闲块无法满足needed的申请要求。

ext4_mb_good_group_nolock函数


/*
 * This could return negative error code if something goes wrong
 * during ext4_mb_init_group(). This should not be called with
 * ext4_lock_group() held.
 *
 * Pre-filter for ext4_mb_good_group(): cheap checks on the group's free
 * count and bitmap state reject hopeless groups early (returning 0);
 * groups that survive are passed to ext4_mb_good_group() and its result
 * is returned.
 *
 * The group lock is taken around the reads only when
 * EXT4_MB_STRICT_CHECK is set in ac->ac_flags.
 *
 * NOTE(review): the "..." between the unlock and the relock marks code
 * elided by the article; presumably that is where ext4_mb_init_group()
 * is called and a negative error can arise - confirm against the full
 * source.
 */
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
				     ext4_group_t group, int cr)
{
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
	struct super_block *sb = ac->ac_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
	ext4_grpblk_t free;
	int ret = 0;

	if (should_lock)
		ext4_lock_group(sb, group);
	free = grp->bb_free;
	/* No free blocks at all. */
	if (free == 0)
		goto out;
	/* For cr 0..2 the group must hold at least the requested length. */
	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
		goto out;
	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
		goto out;
	/* Drop the lock before the elided section below. */
	if (should_lock)
		ext4_unlock_group(sb, group);

    ...

	if (should_lock)
		ext4_lock_group(sb, group);
	ret = ext4_mb_good_group(ac, group, cr);
out:
	/* Reached with the lock held on every path where should_lock is set. */
	if (should_lock)
		ext4_unlock_group(sb, group);
	return ret;
}

做一些基本判定后调用ext4_mb_good_group函数。

ext4_mb_good_group函数

检查在cr严苛程度下能否完成分配


/*
 * Decide whether block group @group is worth scanning for the request
 * described by @ac at strictness level @cr (0 is the strictest, 3 accepts
 * anything). Only the per-group summary counters are consulted, so this
 * can also be called BEFORE the buddy bitmap is loaded.
 *
 * Return: true if the group is suitable for the allocation, false if not.
 */
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
				ext4_group_t group, int cr)
{
	int flexbg_sz = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
	struct ext4_group_info *gi = ext4_get_group_info(ac->ac_sb, group);
	ext4_grpblk_t nr_free, nr_frags;

	BUG_ON(cr < 0 || cr >= 4);

	/* Never allocate from a group whose block bitmap is corrupt. */
	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(gi)))
		return false;

	/* A group with no free blocks cannot help. */
	nr_free = gi->bb_free;
	if (!nr_free)
		return false;

	/* Same for a group with no free fragments (also guards the division). */
	nr_frags = gi->bb_fragments;
	if (!nr_frags)
		return false;

	switch (cr) {
	case 0:
		/* cr 0 is only valid for power-of-two requests. */
		BUG_ON(ac->ac_2order == 0);

		/*
		 * Avoid using the first bg of a flexgroup for data files:
		 * at the strictest level that group is skipped.
		 */
		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
		    flexbg_sz >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME &&
		    group % flexbg_sz == 0)
			return false;

		/* Fewer free blocks in total than requested. */
		if (nr_free < ac->ac_g_ex.fe_len)
			return false;

		/*
		 * Orders above s_blocksize_bits + 1 are accepted without
		 * consulting bb_largest_free_order; any other order must
		 * not exceed the largest free order of the group.
		 */
		return ac->ac_2order > ac->ac_sb->s_blocksize_bits + 1 ||
		       gi->bb_largest_free_order >= ac->ac_2order;
	case 1:
		/* The average free fragment is at least as long as the request. */
		return (nr_free / nr_frags) >= ac->ac_g_ex.fe_len;
	case 2:
		/* The total free count covers the request. */
		return nr_free >= ac->ac_g_ex.fe_len;
	case 3:
		/* Loosest level: any group with free space will do. */
		return true;
	default:
		BUG();
	}

	return false;
}

ext4_mb_complex_scan_group函数


/*
 * The routine scans the group and measures all found extents.
 * In order to optimize scanning, caller must pass number of
 * free blocks in the group, so the routine can know upper limit.
 *
 * As written here the free-block count is read from
 * e4b->bd_info->bb_free rather than passed in. Each free extent found is
 * handed to ext4_mb_measure_extent(); the loop ends when the free count
 * is used up or ac->ac_status leaves AC_STATUS_CONTINUE.
 *
 * NOTE(review): the two "..." lines mark code elided by the article
 * (presumably sanity checks on the scan position and the extent found) -
 * confirm against the full source.
 */
static noinline_for_stack
void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
					struct ext4_buddy *e4b)
{
	struct super_block *sb = ac->ac_sb;
	void *bitmap = e4b->bd_bitmap;
	struct ext4_free_extent ex;
	int i;		/* current scan position inside the group */
	int free;	/* free blocks not yet accounted for by the scan */

	free = e4b->bd_info->bb_free;
	if (WARN_ON(free <= 0))
		return;
	/* Start searching at the first free block of the group. */
	i = e4b->bd_info->bb_first_free;

	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
		/* Advance to the next zero bit (a free block) at or after i. */
		i = mb_find_next_zero_bit(bitmap,
						EXT4_CLUSTERS_PER_GROUP(sb), i);
        ...

		/* Measure the free extent that starts at i. */
		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
        ...
		ex.fe_logical = 0xDEADC0DE; /* debug value */
		/* Check whether this extent is suitable. */
		ext4_mb_measure_extent(ac, &ex, e4b);

		/* Skip past the extent just measured. */
		i += ex.fe_len;
		free -= ex.fe_len;
	}

	ext4_mb_check_limits(ac, e4b, 1);
}

ext4_mb_measure_extent函数


/*
 * Judge one free extent @ex found while scanning a group.
 *
 * If the extent is good enough it is committed right away through
 * ext4_mb_use_best_found(), which is what stops the scan. Otherwise it is
 * compared with the best candidate remembered so far in ac->ac_b_ex and
 * replaces it when it is better; that remembered candidate is what gets
 * used later if nothing good enough turns up.
 *
 * FIXME: real allocation policy is to be designed yet!
 */
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
					struct ext4_free_extent *ex,
					struct ext4_buddy *e4b)
{
	struct ext4_free_extent *goal = &ac->ac_g_ex;
	struct ext4_free_extent *best = &ac->ac_b_ex;

	ac->ac_found++;

	/*
	 * Commit immediately in two cases: the caller asked for the first
	 * thing found (EXT4_MB_HINT_FIRST), or the extent has exactly the
	 * requested length, which is the best possible fit.
	 */
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST) ||
	    ex->fe_len == goal->fe_len) {
		*best = *ex;
		ext4_mb_use_best_found(ac, e4b);
		return;
	}

	/* Nothing remembered yet: this extent becomes the candidate. */
	if (best->fe_len == 0) {
		*best = *ex;
		return;
	}

	if (best->fe_len < goal->fe_len) {
		/*
		 * The candidate cannot satisfy the request yet, so any
		 * longer extent is an improvement.
		 */
		if (ex->fe_len > best->fe_len)
			*best = *ex;
	} else if (ex->fe_len > goal->fe_len && ex->fe_len < best->fe_len) {
		/*
		 * The candidate already satisfies the request: prefer a
		 * shorter extent that still covers it, so that larger free
		 * runs are not broken up needlessly.
		 */
		*best = *ex;
	}

	ext4_mb_check_limits(ac, e4b, 0);
}