int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
int node_id;
int ret;
/*
* Zone reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.
*
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
* thrown out if the zone is overallocated. So we do not reclaim
* if less than a specified percentage of the zone is used by
* unmapped file backed pages.
*/
if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&//都小于最小设定的值
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
if (zone->all_unreclaimable)//设定了标识不回收
return ZONE_RECLAIM_FULL;
/*`
* Do not scan if the allocation should not be delayed.
*/
if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))//__GFP_WAIT不能等待,而PF_MEMALLOC表示内核管理内存执行公务需要申请页的标识,不能等待,否则可能死锁
return ZONE_RECLAIM_NOSCAN;
/*
* Only run zone reclaim on the local zone or on zones that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
node_id = zone_to_nid(zone);
if (node_state(node_id, N_CPU) && node_id != numa_node_id())//不属于该cpu范围,不是NUMA 不扫描
return ZONE_RECLAIM_NOSCAN;
if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))//其他进程在回收,如果没有就上锁,准备回收该zone上的页
return ZONE_RECLAIM_NOSCAN;
ret = __zone_reclaim(zone, gfp_mask, order);//回收该zone上的页
zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);//释放回收锁
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
return ret;
}
static long zone_pagecache_reclaimable(struct zone *zone)
{
long nr_pagecache_reclaimable;
long delta = 0;
/*
* If RECLAIM_SWAP is set, then all file pages are considered
* potentially reclaimable. Otherwise, we have to worry about
* pages like swapcache and zone_unmapped_file_pages() provides
* a better estimate
*/
if (zone_reclaim_mode & RECLAIM_SWAP)//设置了回收模式,并且设置了RECLAIM_SWAP表示可以把页交换到磁盘上来回收页,则所有页都可以回收
nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
else
nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);//未映射的页 = lru - mapped ,负数则返回0
/* If we can't clean pages, remove dirty pages from consideration */
if (!(zone_reclaim_mode & RECLAIM_WRITE))
delta += zone_page_state(zone, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
delta = nr_pagecache_reclaimable;
return nr_pagecache_reclaimable - delta;
}
</pre><p>-------------------------------------------------------------慢速分配:---------------------------------------------------------------</p><p></p><pre name="code" class="cpp">/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
* __GFP_FS is also cleared as it implies __GFP_IO.
*/
static inline gfp_t memalloc_noio_flags(gfp_t flags)
{
if (unlikely(current->flags & PF_MEMALLOC_NOIO))
flags &= ~(__GFP_IO | __GFP_FS);// __GFP_IO 标识任何的IO操作 __GFP_FS 标识任何的文件系统调用
return flags;
}
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;//得到表明是否可以等待
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
bool deferred_compaction = false;
bool contended_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
* reclaim >= MAX_ORDER areas which will never succeed. Callers may
* be using allocators in order of preference for an area that is
* too large.
*/
if (order >= MAX_ORDER) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
/*
* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
* __GFP_NOWARN set) should not cause reclaim since the subsystem
* (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
* using a larger set of nodes after it has established that the
* allowed per node queues are empty and that nodes are
* over allocated.
*/
if (IS_ENABLED(CONFIG_NUMA) &&
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapd(order, zonelist, high_zoneidx,
zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
/*
* Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
* the allocation is high priority and these type of
* allocations are system rather than user orientated
*/
zonelist = node_zonelist(numa_node_id(), gfp_mask);
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocations - we can't balance anything */
if (!wait)//不能等待,则失败返回
goto nopage;
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)//系统执行公务需要分配页,这是不能等太久的,否则系统会陷入短暂的死机
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
/*
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
/*
* If compaction is deferred for high-order allocations, it is because
* sync compaction recently failed. In this is the case and the caller
* requested a movable allocation that does not heavily disrupt the
* system then fail the allocation instead of entering direct reclaim.
*/
if ((deferred_compaction || contended_compaction) &&
(gfp_mask & __GFP_NO_KSWAPD))
goto nopage;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, &did_some_progress);
if (page)
goto got_pg;
/*
* If we failed to make any progress reclaiming, then we are
* running out of options and have to consider going OOM
*/
if (!did_some_progress) {
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
if ((current->flags & PF_DUMPCORE) &&
!(gfp_mask & __GFP_NOFAIL))
goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
migratetype);
if (page)
goto got_pg;
if (!(gfp_mask & __GFP_NOFAIL)) {
/*
* The oom killer is not called for high-order
* allocations that may fail, so if no progress
* is being made, there are no other options and
* retrying is unlikely to help.
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto nopage;
/*
* The oom killer is not called for lowmem
* allocations to prevent needlessly killing
* innocent tasks.
*/
if (high_zoneidx < ZONE_NORMAL)
goto nopage;
}
goto restart;
}
}
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
} else {
/*
* High-order allocations do not necessarily loop after
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
}
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
enum zone_type high_zoneidx,
enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)//从zonelist中的列表中选择最合适的zone进行交互
wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
{ int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/* * The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (atomic) {
/* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule. */
if (!(gfp_mask & __GFP_NOMEMALLOC)) alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
* comment for __cpuset_node_allowed_softwall().
*/
alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (gfp_mask & __GFP_MEMALLOC)
alloc_flags |= ALLOC_NO_WATERMARKS; else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
alloc_flags |= ALLOC_NO_WATERMARKS;
else if (!in_interrupt() &&
((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS; }
#ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
}
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
struct page *page;
do {//再分配
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)//__GFP_NOFALL 设置了,则定时等待把数据写入设备中,空出页来
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));//死循环,直到分配到了页为止
return page;
}
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
int migratetype, bool sync_migration,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
if (!order)
return NULL;
if (compaction_deferred(preferred_zone, order)) {
*deferred_compaction = true;
return NULL;
}
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration,
contended_compaction);
current->flags &= ~PF_MEMALLOC;
if (*did_some_progress != COMPACT_SKIPPED) {
struct page *page;
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
if (order >= preferred_zone->compact_order_failed)
preferred_zone->compact_order_failed = order + 1;
count_vm_event(COMPACTSUCCESS);
return page;
}
/*
* It's bad if compaction run occurs and fails.
* The most likely reason is that pages exist,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
/*
* As async compaction considers a subset of pageblocks, only
* defer if the failure was a sync compaction failure.
*/
if (sync_migration)
defer_compaction(preferred_zone, order);
cond_resched();
}
return NULL;
}
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
if (unlikely(!page)) {
/*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
*/
gfp_mask = memalloc_noio_flags(gfp_mask);
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
}
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
out:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
memcg_kmem_commit_charge(page, memcg, order);
return page;
}