Rewrite - Linux Memory Management - Buddy Allocator (3): Memory Reclaim

This article walks through the Linux kernel's memory-reclaim mechanism: how the kswapd thread is initialized, how asynchronous reclaim is woken up and runs, and how synchronous (direct) reclaim works via __alloc_pages_direct_reclaim. Memory management here involves trying to free pages, allocating pages, and compacting memory, so that the system can reclaim resources effectively when memory runs low.

When pages are requested, the page allocator first tries to allocate against the low watermark. If that fails, memory is mildly short: the allocator wakes the page-reclaim thread (kswapd) of every node eligible for the allocation to reclaim pages asynchronously, then retries against the min watermark. If that fails, memory is seriously short: the allocator runs memory compaction and retries against the min watermark. If even that fails, memory has truly run out and some pages that are not immediately needed must be given back, so the allocator falls back to direct reclaim. Only after all of this does the OOM killer come into play. In other words, there are two reclaim paths: asynchronous reclaim and synchronous (direct) reclaim. The sketch below models this ordering.
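To make that ordering concrete, here is a minimal user-space C model of the slow path just described. This is a sketch, not kernel code: the watermark values, the free-page count, and the wake/compact/reclaim stubs are all invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Invented watermarks for a single zone, in pages. */
enum { WMARK_MIN = 128, WMARK_LOW = 256, WMARK_HIGH = 384 };

static long free_pages = 200;	/* pretend free-page count */

static bool try_alloc(long wmark) { return free_pages > wmark; }
static void wake_kswapd(void) { printf("wake kswapd (async reclaim)\n"); }
static void compact_memory(void) { printf("run memory compaction\n"); }
static bool direct_reclaim(void) { printf("direct (sync) reclaim\n"); free_pages += 64; return true; }

int main(void)
{
	if (try_alloc(WMARK_LOW)) { puts("fast path: low watermark ok"); return 0; }

	wake_kswapd();			/* mild shortage: reclaim asynchronously */
	if (try_alloc(WMARK_MIN)) { puts("allocated after waking kswapd"); return 0; }

	compact_memory();		/* serious shortage: defragment */
	if (try_alloc(WMARK_MIN)) { puts("allocated after compaction"); return 0; }

	/* still failing: reclaim in the caller's own context */
	if (direct_reclaim() && try_alloc(WMARK_MIN)) {
		puts("allocated after direct reclaim");
		return 0;
	}

	puts("nothing left to try: this is where the OOM killer comes in");
	return 1;
}

Running it prints either the fast path or the fallbacks, in exactly the order the allocator tries them.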

1. Asynchronous memory reclaim

Each memory node has one page-reclaim thread. If the free pages of all of the node's zones fall below the high watermark, the reclaim thread repeatedly tries to reclaim pages from that node. A small model of this condition follows.
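This sketch is not the kernel's code: the real check is pgdat_balanced(), where a single eligible zone at or above its high watermark is enough for the node to count as balanced, and the zone names and numbers here are made up.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy zone: a free-page count plus a per-zone high watermark. */
struct zone { const char *name; long free; long wmark_high; };

/* Simplified pgdat_balanced(): one zone at or above its high
 * watermark is enough for the whole node to count as balanced. */
static bool node_balanced(const struct zone *zones, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (zones[i].free >= zones[i].wmark_high)
			return true;
	return false;
}

int main(void)
{
	struct zone zones[] = {
		{ "DMA32",  100, 256 },
		{ "Normal", 100, 384 },
	};

	/* kswapd keeps making reclaim passes until the node is balanced. */
	while (!node_balanced(zones, 2)) {
		zones[1].free += 64;	/* pretend one pass freed 64 pages */
		printf("reclaim pass: Normal free=%ld\n", zones[1].free);
	}
	puts("node balanced, kswapd can go back to sleep");
	return 0;
}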

1.1 Initialization of asynchronous reclaim

kswapd_init does the setup:

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();//set the global page_cluster according to system memory size
	for_each_node_state(nid, N_MEMORY)//iterate over every node that has memory
		kswapd_run(nid);//create the kswapd kernel thread for this node
	return 0;
}

int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);//create the kernel thread "kswapd%d"
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state < SYSTEM_RUNNING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}

During system initialization, kswapd_init creates a kswapd thread on every memory node. The thread body is an infinite loop; let's look at how the kswapd function runs:

static int kswapd(void *p)
{
	unsigned int alloc_order, reclaim_order;
	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	WRITE_ONCE(pgdat->kswapd_order, 0);
	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
	//infinite loop
	for ( ; ; ) {
		bool ret;

		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
							highest_zoneidx);

kswapd_try_sleep:
		//try to sleep and yield the CPU until woken up
		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
					highest_zoneidx);

		/* Read the new order and highest_zoneidx */
		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);//re-read the requested order
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,	//highest zone index we may scan and reclaim
							highest_zoneidx);
		WRITE_ONCE(pgdat->kswapd_order, 0);
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

		ret = try_to_freeze();//try to freeze this thread when the system is suspending
		if (kthread_should_stop())//kthread_stop() was called on us
			break;	//leave the loop

		if (ret)//we were woken from the freezer
			continue;//start over; no balance_pgdat needed

		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
						alloc_order);
		//reclaim from the node to rebalance its watermarks; returns the order actually reclaimed
		reclaim_order = balance_pgdat(pgdat, alloc_order,
						highest_zoneidx);
		if (reclaim_order < alloc_order)//reclaimed less than the requested order
			goto kswapd_try_sleep;
	}

	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);

	return 0;
}

The kswapd flow is:

  1. Enter the infinite for loop; inside it:
  2. Initialize alloc_order and highest_zoneidx.
  3. Call kswapd_try_to_sleep to sleep until woken.
  4. Update alloc_order and highest_zoneidx.
  5. Call try_to_freeze to see whether this thread must freeze (for system suspend); if not, keep going.
  6. Call kthread_should_stop to see whether the thread must stop; if so, leave the loop.
  7. Call balance_pgdat to reclaim from the node and rebalance its watermarks; it returns the order it managed to reclaim. If that is below alloc_order, go back to the sleep attempt.

1.2 Waking up asynchronous reclaim

kswapd sleeps inside kswapd_try_to_sleep, and it resumes from the same function. To see how it gets woken, let's read it:

static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
				unsigned int highest_zoneidx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);

	if (freezing(current) || kthread_should_stop())
		return;

	//add wait to the kswapd_wait queue and set the task interruptible
	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

	//prepare for kswapd to sleep and check whether it may sleep
	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		reset_isolation_suitable(pgdat);//reset compaction's cached isolation hints

		//wake the memory-compaction thread (kcompactd)
		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);

		remaining = schedule_timeout(HZ/10);//trial sleep of 0.1s

		if (remaining) {//nonzero remaining means we were woken early
			//update kswapd_highest_zoneidx and kswapd_order
			WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
					kswapd_highest_zoneidx(pgdat,
							highest_zoneidx));

			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
		}

		finish_wait(&pgdat->kswapd_wait, &wait);//remove wait from the kswapd_wait queue
		//re-add wait to the kswapd_wait queue, interruptible again
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}

	//if the trial sleep was not interrupted, check again whether a full sleep is allowed
	if (!remaining &&
	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

		//compute the normal vmstat threshold and write it into the per-CPU thresholds
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

		if (!kthread_should_stop())//unless the thread is being stopped
			schedule();//sleep until woken

		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else {
		if (remaining)//the trial sleep was interrupted
			//count a KSWAPD_LOW_WMARK_HIT_QUICKLY event
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else//otherwise kswapd was not ready to sleep
			//count a KSWAPD_HIGH_WMARK_HIT_QUICKLY event
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait);//remove wait from the kswapd_wait queue
}

The kswapd_try_to_sleep flow is:

  1. Call prepare_to_wait to add wait to the kswapd_wait queue and set the task interruptible.
  2. Call prepare_kswapd_sleep to check whether kswapd may sleep; if it may, go to 3; otherwise go to 5.
  3. Call reset_isolation_suitable to reset compaction's cached hints, and wakeup_kcompactd to wake the memory-compaction thread.
  4. Call schedule_timeout for a trial 0.1s sleep; if woken early, update kswapd_highest_zoneidx and kswapd_order and re-arm the wait on kswapd_wait.
  5. If the trial sleep ran its course and prepare_kswapd_sleep allows a full sleep, call set_pgdat_percpu_threshold to write the normal per-CPU vmstat thresholds, then call schedule to sleep.
  6. On wakeup, execution continues here; call finish_wait to remove wait from the kswapd_wait queue.

So anything that wakes the kswapd_wait queue makes this thread resume. In the kernel that is what wakeup_kswapd() does on the allocation slow path: it records the requested order and highest zone index in the pgdat and wakes pgdat->kswapd_wait. A rough user-space analogy follows.
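Here is the sleeper/waker pair built on a pthread condition variable standing in for the kernel wait queue. All of it is a sketch: the variables merely mimic pgdat->kswapd_order and pgdat->kswapd_wait.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-ins for pgdat->kswapd_wait and pgdat->kswapd_order. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kswapd_wait = PTHREAD_COND_INITIALIZER;
static int kswapd_order = -1;	/* -1: no request posted yet */

/* The sleeper: a stripped-down kswapd loop body. */
static void *kswapd_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (kswapd_order < 0)	/* kswapd_try_to_sleep() */
		pthread_cond_wait(&kswapd_wait, &lock);
	printf("kswapd woken: reclaim for order %d\n", kswapd_order);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, kswapd_thread, NULL);
	sleep(1);	/* let the sleeper block first */

	/* The waker: roughly what wakeup_kswapd() boils down to. */
	pthread_mutex_lock(&lock);
	kswapd_order = 3;	/* record the requested order */
	pthread_cond_signal(&kswapd_wait);	/* wake the wait queue */
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}

Compile with -pthread; the main thread plays the allocator, the worker plays kswapd.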

1.3 The asynchronous reclaim path

kswapd reclaims through balance_pgdat, which calls kswapd_shrink_node, which in turn ends up in shrink_node. There is a lot of code along the way; the core function, shrink_node, is left for a later article.

2. Synchronous memory reclaim

Direct reclaim works on each zone in the fallback zonelist that is eligible for the allocation. On the slow path, the allocator calls __alloc_pages_direct_reclaim to reclaim pages from the memory node each zone belongs to. __alloc_pages_direct_reclaim:

static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		unsigned long *did_some_progress)
{
	struct page *page = NULL;
	bool drained = false;

	//synchronous direct page reclaim
	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
	if (unlikely(!(*did_some_progress)))
		return NULL;

retry:
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);//attempt the allocation again

	/*
	 * If an allocation failed after direct reclaim, it could be because
	 * pages are pinned on the per-cpu lists or in high alloc reserves.
	 * Shrink them and try again
	 */
	if (!page && !drained) {//allocation failed and we have not retried yet
		//convert highatomic pageblocks back to the requested migratetype
		unreserve_highatomic_pageblock(ac, false);
		drain_all_pages(NULL);//flush the per-CPU page lists back to the buddy system
		drained = true;
		goto retry;//retry exactly once
	}

	return page;
}

The __alloc_pages_direct_reclaim flow is:

  1. Call __perform_reclaim for synchronous direct page reclaim.
  2. Call get_page_from_freelist to attempt the allocation.
  3. If the allocation failed and there has been no retry yet, call unreserve_highatomic_pageblock to convert highatomic pageblocks back to the requested migratetype, call drain_all_pages to flush the per-CPU lists back to the buddy system, and go back to 2 for exactly one retry (modeled in the sketch below).
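"Fail once, release cached pages, retry exactly once" is a generic pattern; here is a minimal user-space sketch of it. The allocator and the per-CPU cache below are invented stand-ins, not the kernel's structures.

#include <stdbool.h>
#include <stdio.h>

static int pcp_cached = 8;	/* pages sitting in a per-CPU cache */
static int buddy_free = 0;	/* pages visible to the allocator */

static bool get_page(void)
{
	if (buddy_free > 0) { buddy_free--; return true; }
	return false;
}

/* Stand-in for drain_all_pages(): move cached pages back to the buddy pool. */
static void drain_pcp_lists(void)
{
	buddy_free += pcp_cached;
	pcp_cached = 0;
}

static bool alloc_after_reclaim(void)
{
	bool drained = false;
retry:
	if (get_page())
		return true;
	if (!drained) {	/* pages may still be pinned in per-CPU lists */
		drain_pcp_lists();
		drained = true;
		goto retry;	/* exactly one retry */
	}
	return false;
}

int main(void)
{
	printf("allocation %s\n", alloc_after_reclaim() ? "succeeded" : "failed");
	return 0;
}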

__perform_reclaim:

static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
					const struct alloc_context *ac)
{
	unsigned int noreclaim_flag;
	unsigned long pflags, progress;

	cond_resched();//voluntarily yield the CPU

	/* We now go into synchronous reclaim */
	cpuset_memory_pressure_bump();
	psi_memstall_enter(&pflags);//no-op unless CONFIG_PSI is enabled
	fs_reclaim_acquire(gfp_mask);//no-op unless lockdep is enabled
	//set PF_MEMALLOC in current->flags: memory reclaim in progress
	noreclaim_flag = memalloc_noreclaim_save();

	//try to free pages
	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
								ac->nodemask);

	memalloc_noreclaim_restore(noreclaim_flag);//restore current->flags: reclaim finished
	fs_reclaim_release(gfp_mask);//no-op unless lockdep is enabled
	psi_memstall_leave(&pflags);//no-op unless CONFIG_PSI is enabled

	cond_resched();//voluntarily yield the CPU

	return progress;
}

The __perform_reclaim flow is:

  1. Call memalloc_noreclaim_save to set PF_MEMALLOC in current->flags, marking that reclaim is in progress so reclaim cannot recurse into itself.
  2. Call try_to_free_pages to try to free pages.
  3. Call memalloc_noreclaim_restore to restore current->flags, marking reclaim finished. The save/restore idiom is sketched below.
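memalloc_noreclaim_save()/memalloc_noreclaim_restore() follow the common "save the old bit, set it, restore it" idiom, which matters because these sections can nest. A user-space sketch of the same idiom follows; current_flags is a stand-in for current->flags, and the PF_MEMALLOC value matches the kernel header but is only illustrative here.

#include <stdio.h>

#define PF_MEMALLOC 0x00000800	/* same value as the kernel's flag; illustrative here */

static unsigned int current_flags;	/* stand-in for current->flags */

/* Save whether PF_MEMALLOC was already set, then set it. */
static unsigned int memalloc_noreclaim_save_model(void)
{
	unsigned int old = current_flags & PF_MEMALLOC;

	current_flags |= PF_MEMALLOC;	/* reclaim must not recurse into reclaim */
	return old;
}

/* Put the bit back exactly as it was, so nesting works. */
static void memalloc_noreclaim_restore_model(unsigned int old)
{
	current_flags = (current_flags & ~PF_MEMALLOC) | old;
}

int main(void)
{
	unsigned int saved = memalloc_noreclaim_save_model();

	printf("inside reclaim: PF_MEMALLOC %s\n",
	       (current_flags & PF_MEMALLOC) ? "set" : "clear");
	memalloc_noreclaim_restore_model(saved);
	printf("after restore: PF_MEMALLOC %s\n",
	       (current_flags & PF_MEMALLOC) ? "set" : "clear");
	return 0;
}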

Let's continue into try_to_free_pages to see how it frees memory:

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {//initialize the scan controller
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.gfp_mask = current_gfp_context(gfp_mask),
		.reclaim_idx = gfp_zone(gfp_mask),
		.order = order,
		.nodemask = nodemask,
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
	};

	/*
	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
	 * Confirm they are large enough for max values.
	 */
	BUILD_BUG_ON(MAX_ORDER > S8_MAX);
	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);

	//if a fatal signal arrives while direct reclaim is being throttled,
	//skip direct reclaim and leave the work to kswapd
	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
		return 1;

	set_task_reclaim_state(current, &sc.reclaim_state);//attach the reclaim_state to current
	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);//the real work of freeing pages

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
	set_task_reclaim_state(current, NULL);//detach the reclaim_state

	return nr_reclaimed;
}

The try_to_free_pages flow is:

  1. Initialize the scan controller sc with designated initializers.
  2. If a fatal signal arrives while direct reclaim is being throttled, skip direct reclaim (kswapd will do the work) and return 1.
  3. Call set_task_reclaim_state to attach sc.reclaim_state to current.
  4. Call do_try_to_free_pages to actually try to free pages.
  5. Call set_task_reclaim_state again to detach the reclaim state.

do_try_to_free_pages is where pages really get freed; let's take a look:

static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					  struct scan_control *sc)
{
	int initial_priority = sc->priority;
	pg_data_t *last_pgdat;
	struct zoneref *z;
	struct zone *zone;
retry:
	delayacct_freepages_start();

	if (!cgroup_reclaim(sc))//global (not cgroup-targeted) reclaim
		//count an ALLOCSTALL event for this zone index
		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);

	do {
		//report memory pressure derived from the reclaim priority
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;//reset the scan counter
		shrink_zones(zonelist, sc);//direct reclaim from the zones

		//reclaimed at least as many pages as needed
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			break;//done

		if (sc->compaction_ready)//compaction has enough free memory
			break;//done

		if (sc->priority < DEF_PRIORITY - 2)//reclaim is getting hard
			sc->may_writepage = 1;//allow writeback of dirty pages
	} while (--sc->priority >= 0);

	last_pgdat = NULL;
	//walk every zone in the zonelist
	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
					sc->nodemask) {
		if (zone->zone_pgdat == last_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;

		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);

		if (cgroup_reclaim(sc)) {
			struct lruvec *lruvec;
			//get this memcg's lruvec for the node
			lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
						   zone->zone_pgdat);
			clear_bit(LRUVEC_CONGESTED, &lruvec->flags);//clear the lruvec's congested flag
		}
	}

	delayacct_freepages_end();//finish the delay-accounting timestamps and counters

	if (sc->nr_reclaimed)//some pages were directly reclaimed
		return sc->nr_reclaimed;//return how many

	//nothing was reclaimed, but compaction is ready
	if (sc->compaction_ready)
		return 1;//report progress so the caller tries compaction

	/*
	 * We make inactive:active ratio decisions based on the node's
	 * composition of memory, but a restrictive reclaim_idx or a
	 * memory.low cgroup setting can exempt large amounts of
	 * memory from reclaim. Neither of which are very common, so
	 * instead of doing costly eligibility calculations of the
	 * entire cgroup subtree up front, we assume the estimates are
	 * good, and retry with forcible deactivation if that fails.
	 */
	if (sc->skipped_deactivate) {
		sc->priority = initial_priority;
		sc->force_deactivate = 1;
		sc->skipped_deactivate = 0;
		goto retry;
	}

	/* Untapped cgroup reserves?  Don't OOM, retry. */
	if (sc->memcg_low_skipped) {
		sc->priority = initial_priority;
		sc->force_deactivate = 0;
		sc->memcg_low_reclaim = 1;
		sc->memcg_low_skipped = 0;
		goto retry;
	}

	return 0;
}

The do_try_to_free_pages flow is:

  1. If this is global (not cgroup-targeted) reclaim, count an ALLOCSTALL event.
  2. Loop from sc->priority down to 0, decrementing the priority each pass (modeled in the sketch after this list); in the loop body:
  3. Call vmpressure_prio to report memory pressure derived from the reclaim priority.
  4. Call shrink_zones to reclaim directly from the zones.
  5. If enough pages have been reclaimed, or compaction is ready, leave the loop.
  6. If the priority has dropped low, meaning reclaim is getting hard, set may_writepage to allow writeback.
  7. Use for_each_zone_zonelist_nodemask to walk every zone and clear the congested flag on each node's lruvec.
  8. Call delayacct_freepages_end to finish the delay-accounting statistics.
  9. If pages were reclaimed, return the number of pages reclaimed.
  10. If nothing was reclaimed but compaction is ready, return 1 so the caller tries compaction.
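The priority loop deserves a model: each drop in priority roughly doubles the scan window, since the kernel scans on the order of lru_size >> priority pages per pass. A toy version follows, with an invented LRU size and a pretend reclaim ratio.

#include <stdio.h>

#define DEF_PRIORITY 12	/* as in the kernel */

int main(void)
{
	long lru_size = 4096;		/* invented LRU length, in pages */
	long nr_to_reclaim = 32;	/* SWAP_CLUSTER_MAX in the kernel */
	long nr_reclaimed = 0;

	for (int priority = DEF_PRIORITY; priority >= 0; priority--) {
		long scan = lru_size >> priority;	/* window doubles every pass */
		long got = scan / 4;	/* pretend a quarter of scanned pages are freeable */

		nr_reclaimed += got;
		printf("priority %2d: scan %4ld, reclaim %3ld, total %3ld\n",
		       priority, scan, got, nr_reclaimed);
		if (nr_reclaimed >= nr_to_reclaim)
			break;	/* sc->nr_reclaimed >= sc->nr_to_reclaim */
	}
	return 0;
}

Early passes scan almost nothing; only as the priority drops does the scan window grow large enough to satisfy the target.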

shrink_zones is the core of direct reclaim:

static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	gfp_t orig_mask;
	pg_data_t *last_pgdat = NULL;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	orig_mask = sc->gfp_mask;
	if (buffer_heads_over_limit) {
		sc->gfp_mask |= __GFP_HIGHMEM;
		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
	}
	//iterate over every zone in the zonelist
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					sc->reclaim_idx, sc->nodemask) {
		/*
		 * Take care memory controller reclaiming has small influence
		 * to global LRU.
		 */
		if (!cgroup_reclaim(sc)) {//global (not cgroup-targeted) reclaim
			if (!cpuset_zone_allowed(zone,
						 GFP_KERNEL | __GFP_HARDWALL))
				continue;

			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
			if (IS_ENABLED(CONFIG_COMPACTION) &&	//compaction is available
			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&	//costly high-order allocation
			    compaction_ready(zone, sc)) {	//compaction has enough free memory
				sc->compaction_ready = true;
				continue;//enough free memory for compaction already; don't free more
			}

			/*
			 * Shrink each node in the zonelist once. If the
			 * zonelist is ordered by zone (not the default) then a
			 * node may be shrunk multiple times but in that case
			 * the user prefers lower zones being preserved.
			 */
			if (zone->zone_pgdat == last_pgdat)//this node was already handled
				continue;//shrink each node only once

			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			/* need some check for avoid more shrink_zone() */
		}

		/* See comment about same check for global reclaim above */
		if (zone->zone_pgdat == last_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;
		shrink_node(zone->zone_pgdat, sc);//reclaim pages from this memory node
	}

	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM.
	 */
	sc->gfp_mask = orig_mask;
}

shrink_zones does one thing: walk every zone in the zonelist and call shrink_node once per memory node to reclaim that node's pages. The last_pgdat check is what enforces "once per node"; the sketch below isolates that pattern.
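Several zones can belong to one node, so shrink_node should run once per node, not once per zone. Here is a user-space sketch of the dedup pattern, assuming (as in the default zonelist order) that a node's zones sit next to each other; all names and the shrink stub are invented.

#include <stdio.h>

struct pgdat { int node_id; };
struct zone { const char *name; struct pgdat *zone_pgdat; };

static void shrink_node_model(struct pgdat *pgdat)
{
	printf("shrink_node on node %d\n", pgdat->node_id);
}

int main(void)
{
	struct pgdat node0 = { 0 }, node1 = { 1 };
	struct zone zonelist[] = {	/* zones of one node sit next to each other */
		{ "Normal", &node0 }, { "DMA32", &node0 }, { "Normal", &node1 },
	};
	struct pgdat *last_pgdat = NULL;

	for (size_t i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
		if (zonelist[i].zone_pgdat == last_pgdat)
			continue;	/* this node was shrunk already */
		last_pgdat = zonelist[i].zone_pgdat;
		shrink_node_model(last_pgdat);
	}
	return 0;
}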

3. shrink_node

This part is sprawling and touches a lot of ground: reclaiming pages from the inactive LRU lists, shrinking slab caches, swapping pages in and out, moving pages between the inactive and active LRU lists, and more. It is left for a later article.
