The reason I wanted to write this article is a question:
Suppose slab asks the kernel for an unmovable page, but the kernel has run out of unmovable pages and only movable ones are left. By the buddy system's design, one is then stolen from the movable free lists (see __rmqueue_fallback). Now, by the design principle of the compaction subsystem, movable pages are precisely the best candidates for compaction — so if this page in use by slab gets moved by the compaction code, doesn't slab fall apart? Besides, the book doesn't seem to say anything about how unmovable pages are compacted. So: does the compaction subsystem really move unmovable pages, and if it does, how is such a page moved?
Memory experts, please excuse this rookie question — it is admittedly not a sophisticated one. But since the book offered no answer, I went off to read the code.
The code in this article is from Linux 4.19.195.
In v4.6-rc1 the kernel developers moved the background compaction work out of kswapd into kcompactd, see
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=698b1b30642f1ff0ea10ef1de9745ab633031377
This article mainly walks through the kcompactd thread and answers the question above.
/*
 * The background compaction daemon, started as a kernel thread
 * from the init process.
 */
static int kcompactd(void *p)
{
    pg_data_t *pgdat = (pg_data_t *)p;
    struct task_struct *tsk = current;
    const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

    if (!cpumask_empty(cpumask))
        set_cpus_allowed_ptr(tsk, cpumask);

    set_freezable();

    pgdat->kcompactd_max_order = 0;
    pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;

    while (!kthread_should_stop()) {
        trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
        wait_event_freezable(pgdat->kcompactd_wait,
                kcompactd_work_requested(pgdat));

        kcompactd_do_work(pgdat);
    }

    return 0;
}
kcompactd's work is mainly done by kcompactd_do_work. Note that the thread initializes kcompactd_max_order to 0 and kcompactd_classzone_idx to pgdat->nr_zones - 1.
Clearly, there is one kcompactd thread per memory node.
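For completeness, the wait condition used above is trivial — kcompactd sleeps until someone (for instance the kswapd wakeup path, via wakeup_kcompactd()) records a pending order in the node:

static bool kcompactd_work_requested(pg_data_t *pgdat)
{
    return pgdat->kcompactd_max_order > 0;
}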
static void kcompactd_do_work(pg_data_t *pgdat)
{
    /*
     * With no special task, compact all zones so that a page of requested
     * order is allocatable.
     */
    int zoneid;
    struct zone *zone;
    struct compact_control cc = {
        .order = pgdat->kcompactd_max_order,
        .classzone_idx = pgdat->kcompactd_classzone_idx,
        .mode = MIGRATE_SYNC_LIGHT, // lightweight synchronous, of all things
        .ignore_skip_hint = false,
        .gfp_mask = GFP_KERNEL, // this one looks odd
    };
    trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
                            cc.classzone_idx);
    count_compact_event(KCOMPACTD_WAKE);

    for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
        int status;

        zone = &pgdat->node_zones[zoneid];
        if (!populated_zone(zone))
            continue;

        if (compaction_deferred(zone, cc.order))
            continue;

        if (compaction_suitable(zone, cc.order, 0, zoneid) !=
                        COMPACT_CONTINUE)
            continue;

        if (kthread_should_stop())
            return;
        cc.zone = zone;
        status = compact_zone(zone, &cc);

        if (status == COMPACT_SUCCESS) {
            compaction_defer_reset(zone, cc.order, false);
        } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
            /*
             * Buddy pages may become stranded on pcps that could
             * otherwise coalesce on the zone's free area for
             * order >= cc.order. This is ratelimited by the
             * upcoming deferral.
             */
            drain_all_pages(zone);

            /*
             * We use sync migration mode here, so we defer like
             * sync direct compaction does.
             */
            defer_compaction(zone, cc.order);
        }

        count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
                     cc.total_migrate_scanned);
        count_compact_events(KCOMPACTD_FREE_SCANNED,
                     cc.total_free_scanned);

        VM_BUG_ON(!list_empty(&cc.freepages));
        VM_BUG_ON(!list_empty(&cc.migratepages));
    }

    /*
     * Regardless of success, we are done until woken up next. But remember
     * the requested order/classzone_idx in case it was higher/tighter than
     * our current ones
     */
    if (pgdat->kcompactd_max_order <= cc.order)
        pgdat->kcompactd_max_order = 0;
    if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
        pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
}
struct compact_control cc is the kernel's usual trick: pack all the control parameters into one structure. What puzzled me at first is that cc's mode is set to MIGRATE_SYNC_LIGHT and its gfp_mask to GFP_KERNEL. Why puzzling? Because several checks later on consult exactly these two members, and they threw me off completely — more on that below.
From the code above, the main skeleton is one for loop that scans zones from the bottom up — say from ZONE_NORMAL to ZONE_HIGHMEM if those were the only two (on most machines the scan actually starts at ZONE_DMA). Inside the loop, compaction_suitable decides whether a zone is worth compacting: if a page of the requested order is already allocatable, no compaction is needed; if there are too few free pages to work with, compaction cannot proceed; and if the run was manually triggered via echo 1 > /proc/sys/vm/compact_memory, it must proceed regardless. All three cases are encoded in __compaction_suitable, sketched below. A zone deemed suitable is then handed to compact_zone.
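Here is __compaction_suitable with the statistics and tracing stripped away — a simplified sketch, so check mm/compaction.c for the exact watermark selection:

/* Simplified sketch of __compaction_suitable(); details approximate. */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
                    unsigned int alloc_flags,
                    int classzone_idx,
                    unsigned long wmark_target)
{
    unsigned long watermark;

    /* order == -1 means "echo 1 > /proc/sys/vm/compact_memory": always run. */
    if (is_via_compact_memory(order))
        return COMPACT_CONTINUE;

    /* If an allocation of this order would already succeed, nothing to do. */
    if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
                classzone_idx, alloc_flags))
        return COMPACT_SUCCESS;

    /*
     * Compaction needs enough free order-0 pages to use as migration
     * targets; without that headroom (compact_gap()), give up.
     */
    watermark = low_wmark_pages(zone) + compact_gap(order);
    if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
                ALLOC_CMA, wmark_target))
        return COMPACT_SKIPPED;

    return COMPACT_CONTINUE;
}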
static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
{
    enum compact_result ret;
    unsigned long start_pfn = zone->zone_start_pfn;
    unsigned long end_pfn = zone_end_pfn(zone);
    const bool sync = cc->mode != MIGRATE_ASYNC;

    /*
     * These counters track activities during zone compaction. Initialize
     * them before compacting a new zone.
     */
    cc->total_migrate_scanned = 0;
    cc->total_free_scanned = 0;
    cc->nr_migratepages = 0;
    cc->nr_freepages = 0;
    INIT_LIST_HEAD(&cc->freepages);
    INIT_LIST_HEAD(&cc->migratepages);

    cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
    ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                            cc->classzone_idx);
    /* Compaction is likely to fail */
    if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
        return ret;

    /* huh, compaction_suitable is returning something unexpected */
    VM_BUG_ON(ret != COMPACT_CONTINUE);

    /*
     * Clear pageblock skip if there were failures recently and compaction
     * is about to be retried after being deferred.
     */
    if (compaction_restarting(zone, cc->order))
        __reset_isolation_suitable(zone);

    /*
     * Setup to move all movable pages to the end of the zone. Used cached
     * information on where the scanners should start (unless we explicitly
     * want to compact the whole zone), but check that it is initialised
     * by ensuring the values are within zone boundaries.
     */
    if (cc->whole_zone) { // scan the entire zone
        cc->migrate_pfn = start_pfn;
        cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
    } else {
        cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
        cc->free_pfn = zone->compact_cached_free_pfn;
        if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
            cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
            zone->compact_cached_free_pfn = cc->free_pfn;
        }
        if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
            cc->migrate_pfn = start_pfn;
            zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
            zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
        }

        if (cc->migrate_pfn == start_pfn)
            cc->whole_zone = true;
    }

    cc->last_migrated_pfn = 0;

    trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                cc->free_pfn, end_pfn, sync);

    migrate_prep_local();

    while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { // decides whether compaction can stop
        int err;

        switch (isolate_migratepages(zone, cc)) {
        case ISOLATE_ABORT:
            ret = COMPACT_CONTENDED;
            putback_movable_pages(&cc->migratepages);
            cc->nr_migratepages = 0;
            goto out;
        case ISOLATE_NONE:
            /*
             * We haven't isolated and migrated anything, but
             * there might still be unflushed migrations from
             * previous cc->order aligned block.
             */
            goto check_drain;
        case ISOLATE_SUCCESS:
            ;
        }

        err = migrate_pages(&cc->migratepages, compaction_alloc,
                compaction_free, (unsigned long)cc, cc->mode,
                MR_COMPACTION);

        trace_mm_compaction_migratepages(cc->nr_migratepages, err,
                            &cc->migratepages);

        /* All pages were either migrated or will be released */
        cc->nr_migratepages = 0;
        if (err) {
            putback_movable_pages(&cc->migratepages);
            /*
             * migrate_pages() may return -ENOMEM when scanners meet
             * and we want compact_finished() to detect it
             */
            if (err == -ENOMEM && !compact_scanners_met(cc)) {
                ret = COMPACT_CONTENDED;
                goto out;
            }
            /*
             * We failed to migrate at least one page in the current
             * order-aligned block, so skip the rest of it.
             */
            if (cc->direct_compaction &&
                        (cc->mode == MIGRATE_ASYNC)) {
                cc->migrate_pfn = block_end_pfn(
                        cc->migrate_pfn - 1, cc->order);
                /* Draining pcplists is useless in this case */
                cc->last_migrated_pfn = 0;
            }
        }

check_drain:
        /*
         * Has the migration scanner moved away from the previous
         * cc->order aligned block where we migrated from? If yes,
         * flush the pages that were freed, so that they can merge and
         * compact_finished() can detect immediately if allocation
         * would succeed.
         */
        if (cc->order > 0 && cc->last_migrated_pfn) {
            int cpu;
            unsigned long current_block_start =
                block_start_pfn(cc->migrate_pfn, cc->order);

            if (cc->last_migrated_pfn < current_block_start) {
                cpu = get_cpu();
                lru_add_drain_cpu(cpu);
                drain_local_pages(zone);
                put_cpu();
                /* No more flushing until we migrate again */
                cc->last_migrated_pfn = 0;
            }
        }
    }

out:
    /*
     * Release free pages and update where the free scanner should restart,
     * so we don't leave any returned pages behind in the next attempt.
     */
    if (cc->nr_freepages > 0) {
        unsigned long free_pfn = release_freepages(&cc->freepages);

        cc->nr_freepages = 0;
        VM_BUG_ON(free_pfn == 0);
        /* The cached pfn is always the first in a pageblock */
        free_pfn = pageblock_start_pfn(free_pfn);
        /*
         * Only go back, not forward. The cached pfn might have been
         * already reset to zone end in compact_finished()
         */
        if (free_pfn > zone->compact_cached_free_pfn)
            zone->compact_cached_free_pfn = free_pfn;
    }

    count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
    count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);

    trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                cc->free_pfn, end_pfn, sync, ret);

    return ret;
}
The function is long, so let's take it piece by piece.
On entry, gfpflags_to_migratetype derives the migratetype from cc->gfp_mask — and remember what was said above: with GFP_KERNEL this comes out as MIGRATE_UNMOVABLE. Next, compaction_suitable checks once more whether this round of compaction makes sense (an interesting point, see the next paragraph). If it does, the start and end points of this scan are set up and we enter a while loop whose exit condition is computed by compact_finished, sketched below. The loop body calls isolate_migratepages to isolate the candidate pages for this round, then migrate_pages to actually migrate them. The rest is statistics bookkeeping. So, three main steps: decide whether compaction is suitable -> isolate pages -> migrate pages.
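compact_finished is worth a glance too. This is a condensed sketch from the 4.19 source rather than a verbatim quote — the real __compact_finished has more cases (pageblock boundaries, CMA and stealable fallback lists):

/* A condensed sketch of __compact_finished(), not the verbatim source. */
static enum compact_result __compact_finished(struct zone *zone,
                        struct compact_control *cc)
{
    unsigned int order;
    const int migratetype = cc->migratetype;

    /* The migrate and free scanners have met: the zone is fully scanned. */
    if (compact_scanners_met(cc)) {
        reset_cached_positions(zone);
        return cc->whole_zone ? COMPACT_COMPLETE : COMPACT_PARTIAL_SKIPPED;
    }

    /* Manual compaction (order == -1, the /proc trigger) never stops early. */
    if (is_via_compact_memory(cc->order))
        return COMPACT_CONTINUE;

    /* Is a free page of the requested order available by now? */
    for (order = cc->order; order < MAX_ORDER; order++) {
        struct free_area *area = &zone->free_area[order];

        /* (the real code also accepts CMA and stealable fallback lists) */
        if (!list_empty(&area->free_list[migratetype]))
            return COMPACT_SUCCESS;
    }

    return COMPACT_NO_SUITABLE_PAGE; /* the wrapper maps this to COMPACT_CONTINUE */
}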
Hold on — the caller just ran compaction_suitable, so why does compact_zone run it again on entry? My take: this is lock-free programming, so the check is repeated after entering. If some other compaction path happened to finish just before this check, there is no need to do the work all over again.
Now for isolate_migratepages, which performs the page isolation. It searches through the pages managed by the zone, walking them chunk by chunk.
/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
                    struct compact_control *cc)
{
    unsigned long block_start_pfn;
    unsigned long block_end_pfn;
    unsigned long low_pfn;
    struct page *page;
    const isolate_mode_t isolate_mode =
        (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
        (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);

    /*
     * Start at where we last stopped, or beginning of the zone as
     * initialized by compact_zone()
     */
    low_pfn = cc->migrate_pfn;
    block_start_pfn = pageblock_start_pfn(low_pfn);
    if (block_start_pfn < zone->zone_start_pfn)
        block_start_pfn = zone->zone_start_pfn;

    /* Only scan within a pageblock boundary */
    block_end_pfn = pageblock_end_pfn(low_pfn);

    /*
     * Iterate over whole pageblocks until we find the first suitable.
     * Do not cross the free scanner.
     */
    for (; block_end_pfn <= cc->free_pfn;
            low_pfn = block_end_pfn,
            block_start_pfn = block_end_pfn,
            block_end_pfn += pageblock_nr_pages) {

        /*
         * This can potentially iterate a massively long zone with
         * many pageblocks unsuitable, so periodically check if we
         * need to schedule, or even abort async compaction.
         */
        if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
                    && compact_should_abort(cc))
            break;

        page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
                                    zone);
        if (!page)
            continue;

        /* If isolation recently failed, do not retry */
        if (!isolation_suitable(cc, page))
            continue;

        /*
         * For async compaction, also only scan in MOVABLE blocks.
         * Async compaction is optimistic to see if the minimum amount
         * of work satisfies the allocation.
         */
        if (!suitable_migration_source(cc, page))
            continue;

        /* Perform the isolation */
        low_pfn = isolate_migratepages_block(cc, low_pfn,
                        block_end_pfn, isolate_mode);

        if (!low_pfn || cc->contended)
            return ISOLATE_ABORT;

        /*
         * Either we isolated something and proceed with migration. Or
         * we failed and compact_zone should decide if we should
         * continue or not.
         */
        break;
    }

    /* Record where migration scanner will be restarted. */
    cc->migrate_pfn = low_pfn;

    return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
The logic here: scan one pageblock at a time (see the loop increment block_end_pfn += pageblock_nr_pages); pageblock_pfn_to_page returns the struct page of the first physical page in the block; suitable_migration_source decides whether the block is a suitable migration source; and if it is, isolate_migratepages_block performs the isolation. The pageblock helpers are simple alignment macros, shown below.
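From mm/internal.h — pageblock_nr_pages is 1 << pageblock_order, typically 512 pages, i.e. 2 MiB with 4 KiB pages:

#define pageblock_start_pfn(pfn)    round_down(pfn, pageblock_nr_pages)
#define pageblock_end_pfn(pfn)      ALIGN((pfn) + 1, pageblock_nr_pages)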
Let's look at suitable_migration_source first. This is where the initial cc values set above come back into play.
static bool suitable_migration_source(struct compact_control *cc,
                            struct page *page)
{
    int block_mt;

    if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
        return true;

    block_mt = get_pageblock_migratetype(page);

    if (cc->migratetype == MIGRATE_MOVABLE)
        return is_migrate_movable(block_mt);
    else
        return block_mt == cc->migratetype;
}
At first glance: get_pageblock_migratetype fetches the pageblock's migratetype, and the block is a suitable source if that matches the migratetype demanded by the GFP_KERNEL in our compaction control cc. Does that mean kcompactd only migrates pages out of unmovable pageblocks?
Look more carefully at the function's very first check: kcompactd runs with migration mode MIGRATE_SYNC_LIGHT (and never sets cc->direct_compaction), so cc->mode != MIGRATE_ASYNC is true and the function returns true right there. In other words, pageblocks of every migratetype are fair game for compaction.
The first time I read this, never mind the lengthy analysis — the conclusion alone left me baffled.
Let's stay baffled for now; it only clears up once the whole flow is strung together.
Next up is isolate_migratepages_block, which performs the isolation on a pageblock. (As I understand it, this isolation step is the key to efficiency: once the pages are pulled off their lists, there is no need to hold the lock for long stretches — the lock is only required while isolating and while putting pages back, and can safely be dropped for everything in between.)
/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *                  a single pageblock
 * @cc:     Compaction control structure.
 * @low_pfn:    The first PFN to isolate
 * @end_pfn:    The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be both less, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
            unsigned long end_pfn, isolate_mode_t isolate_mode)
{
    struct zone *zone = cc->zone;
    unsigned long nr_scanned = 0, nr_isolated = 0;
    struct lruvec *lruvec;
    unsigned long flags = 0;
    bool locked = false;
    struct page *page = NULL, *valid_page = NULL;
    unsigned long start_pfn = low_pfn;
    bool skip_on_failure = false;
    unsigned long next_skip_pfn = 0;

    /*
     * Ensure that there are not too many pages isolated from the LRU
     * list by either parallel reclaimers or compaction. If there are,
     * delay for some time until fewer pages are isolated
     */
    while (unlikely(too_many_isolated(zone))) {
        /* async migration should just abort */
        if (cc->mode == MIGRATE_ASYNC)
            return 0;

        congestion_wait(BLK_RW_ASYNC, HZ/10);

        if (fatal_signal_pending(current))
            return 0;
    }

    if (compact_should_abort(cc))
        return 0;

    if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
        skip_on_failure = true;
        next_skip_pfn = block_end_pfn(low_pfn, cc->order);
    }

    /* Time to isolate some pages for migration */
    for (; low_pfn < end_pfn; low_pfn++) {

        if (skip_on_failure && low_pfn >= next_skip_pfn) {
            /*
             * We have isolated all migration candidates in the
             * previous order-aligned block, and did not skip it due
             * to failure. We should migrate the pages now and
             * hopefully succeed compaction.
             */
            if (nr_isolated)
                break;

            /*
             * We failed to isolate in the previous order-aligned
             * block. Set the new boundary to the end of the
             * current block. Note we can't simply increase
             * next_skip_pfn by 1 << order, as low_pfn might have
             * been incremented by a higher number due to skipping
             * a compound or a high-order buddy page in the
             * previous loop iteration.
             */
            next_skip_pfn = block_end_pfn(low_pfn, cc->order);
        }

        /*
         * Periodically drop the lock (if held) regardless of its
         * contention, to give chance to IRQs. Abort async compaction
         * if contended.
         */
        if (!(low_pfn % SWAP_CLUSTER_MAX)
            && compact_unlock_should_abort(zone_lru_lock(zone), flags,
                                &locked, cc))
            break;

        if (!pfn_valid_within(low_pfn))
            goto isolate_fail;
        nr_scanned++;

        page = pfn_to_page(low_pfn);

        if (!valid_page)
            valid_page = page;

        /*
         * Skip if free. We read page order here without zone lock
         * which is generally unsafe, but the race window is small and
         * the worst thing that can happen is that we skip some
         * potential isolation targets.
         */
        if (PageBuddy(page)) {
            unsigned long freepage_order = page_order_unsafe(page);

            /*
             * Without lock, we cannot be sure that what we got is
             * a valid page order. Consider only values in the
             * valid order range to prevent low_pfn overflow.
             */
            if (freepage_order > 0 && freepage_order < MAX_ORDER)
                low_pfn += (1UL << freepage_order) - 1;
            continue;
        }

        /*
         * Regardless of being on LRU, compound pages such as THP and
         * hugetlbfs are not to be compacted. We can potentially save
         * a lot of iterations if we skip them at once. The check is
         * racy, but we can consider only valid values and the only
         * danger is skipping too much.
         */
        if (PageCompound(page)) {
            const unsigned int order = compound_order(page);

            if (likely(order < MAX_ORDER))
                low_pfn += (1UL << order) - 1;
            goto isolate_fail;
        }

        /*
         * Check may be lockless but that's ok as we recheck later.
         * It's possible to migrate LRU and non-lru movable pages.
         * Skip any other type of page
         */
        if (!PageLRU(page)) {
            /*
             * __PageMovable can return false positive so we need
             * to verify it under page_lock.
             */
            if (unlikely(__PageMovable(page)) &&
                    !PageIsolated(page)) {
                if (locked) {
                    spin_unlock_irqrestore(zone_lru_lock(zone),
                                    flags);
                    locked = false;
                }

                if (!isolate_movable_page(page, isolate_mode))
                    goto isolate_success;
            }

            goto isolate_fail;
        }

        /*
         * Migration will fail if an anonymous page is pinned in memory,
         * so avoid taking lru_lock and isolating it unnecessarily in an
         * admittedly racy check.
         */
        if (!page_mapping(page) &&
            page_count(page) > page_mapcount(page))
            goto isolate_fail;

        /*
         * Only allow to migrate anonymous pages in GFP_NOFS context
         * because those do not depend on fs locks.
         */
        if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
            goto isolate_fail;

        /* If we already hold the lock, we can skip some rechecking */
        if (!locked) {
            locked = compact_trylock_irqsave(zone_lru_lock(zone),
                                &flags, cc);
            if (!locked)
                break;

            /* Recheck PageLRU and PageCompound under lock */
            if (!PageLRU(page))
                goto isolate_fail;

            /*
             * Page become compound since the non-locked check,
             * and it's on LRU. It can only be a THP so the order
             * is safe to read and it's 0 for tail pages.
             */
            if (unlikely(PageCompound(page))) {
                low_pfn += (1UL << compound_order(page)) - 1;
                goto isolate_fail;
            }
        }

        lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);

        /* Try isolate the page */
        if (__isolate_lru_page(page, isolate_mode) != 0)
            goto isolate_fail;

        VM_BUG_ON_PAGE(PageCompound(page), page);

        /* Successfully isolated */
        del_page_from_lru_list(page, lruvec, page_lru(page));
        inc_node_page_state(page,
                NR_ISOLATED_ANON + page_is_file_cache(page));

isolate_success:
        list_add(&page->lru, &cc->migratepages);
        cc->nr_migratepages++;
        nr_isolated++;

        /*
         * Record where we could have freed pages by migration and not
         * yet flushed them to buddy allocator.
         * - this is the lowest page that was isolated and likely be
         * then freed by migration.
         */
        if (!cc->last_migrated_pfn)
            cc->last_migrated_pfn = low_pfn;

        /* Avoid isolating too much */
        if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
            ++low_pfn;
            break;
        }

        continue;

isolate_fail:
        if (!skip_on_failure)
            continue;

        /*
         * We have isolated some pages, but then failed. Release them
         * instead of migrating, as we cannot form the cc->order buddy
         * page anyway.
         */
        if (nr_isolated) {
            if (locked) {
                spin_unlock_irqrestore(zone_lru_lock(zone), flags);
                locked = false;
            }
            putback_movable_pages(&cc->migratepages);
            cc->nr_migratepages = 0;
            cc->last_migrated_pfn = 0;
            nr_isolated = 0;
        }

        if (low_pfn < next_skip_pfn) {
            low_pfn = next_skip_pfn - 1;
            /*
             * The check near the loop beginning would have updated
             * next_skip_pfn too, but this is a bit simpler.
             */
            next_skip_pfn += 1UL << cc->order;
        }
    }

    /*
     * The PageBuddy() check could have potentially brought us outside
     * the range to be scanned.
     */
    if (unlikely(low_pfn > end_pfn))
        low_pfn = end_pfn;

    if (locked)
        spin_unlock_irqrestore(zone_lru_lock(zone), flags);

    /*
     * Update the pageblock-skip information and cached scanner pfn,
     * if the whole pageblock was scanned without isolating any page.
     */
    if (low_pfn == end_pfn)
        update_pageblock_skip(cc, valid_page, nr_isolated, true);

    trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
                        nr_scanned, nr_isolated);

    cc->total_migrate_scanned += nr_scanned;
    if (nr_isolated)
        count_compact_events(COMPACTISOLATED, nr_isolated);

    return low_pfn;
}
I've pasted several of these extra-long functions already, so from here on, highlights only — otherwise even I can't keep reading.
Nominally this function isolates pages; in reality it also performs a pile of checks to decide whether a given page is fit to be isolated at all. A page that isn't fit doesn't get isolated, and compaction therefore never touches it — hey, this is starting to sound very relevant to the opening question.
The body is one for loop, walking every page from the start of the pageblock to its end and judging whether it can be isolated for compaction.
First, if (PageBuddy(page)) always ends in continue: pages sitting free in the buddy allocator are skipped;
Next, if (PageCompound(page)) always ends in goto isolate_fail: compound pages (THP, hugetlbfs) are not compacted either;
Then, for pages failing PageLRU(page), only those passing if (unlikely(__PageMovable(page)) && !PageIsolated(page)) get a shot at isolate_movable_page. That is, a non-LRU page is isolated only if it passes the __PageMovable test (see the snippet after this list) and is not already isolated — it turns out these are pages owned by the likes of zsmalloc, a topic for later study;
Finally, what remains are the LRU pages, subjected to a few more checks (note that PageLRU(page) must be rechecked after taking the lock for the test to be concurrency-safe) — see the code for the details. Bottom line: essentially, only pages on the LRU lists get compacted. ---- Feels like the answer to the opening question is very close now.
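As an aside, the __PageMovable() test used in the non-LRU branch above is a bit trick on page->mapping (include/linux/page-flags.h); drivers like zsmalloc opt their pages in via __SetPageMovable():

static __always_inline int __PageMovable(struct page *page)
{
    return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
                PAGE_MAPPING_MOVABLE;
}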
What follows is migrating the isolated pages; once migration completes, this round of compaction is done.
Back to the original question. If the pages being compacted are essentially LRU pages, what lives on the LRU lists? File pages and anonymous pages, of course. Anonymous pages are allocated in the fault path (handle_pte_fault and friends) with GFP_HIGHUSER_MOVABLE, i.e. movable — they are the most natural compaction material there is, and the migration recipe is as simple as tearing down the page-table mappings, copying the page contents, and re-establishing the mappings. A sketch of that path follows.
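Here is that recipe in code — a heavily abridged sketch based on __unmap_and_move in mm/migrate.c, keeping only the unmap/copy/remap skeleton. The function and flag names are real; the trimming and the _sketch suffix are mine, and all locking, writeback and error handling are omitted:

static int __unmap_and_move_sketch(struct page *page, struct page *newpage,
                   int force, enum migrate_mode mode)
{
    int rc = -EAGAIN;
    bool page_was_mapped = false;

    /* ... lock_page(), writeback and anon_vma handling omitted ... */

    if (page_mapped(page)) {
        /* 1. tear down every PTE mapping the old page */
        try_to_unmap(page,
            TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
        page_was_mapped = true;
    }

    /* 2. copy the data and page state over to newpage */
    rc = move_to_new_page(newpage, page, mode);

    /*
     * 3. rewire the PTEs: point them at newpage on success,
     *    or restore them to the old page on failure.
     */
    if (page_was_mapped)
        remove_migration_ptes(page,
            rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);

    return rc;
}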
And file pages? First, their migratetype needs pinning down... Page cache pages are obtained (or allocated) via find_get_page, so let's take a look.
static inline struct page *find_get_page_flags(struct address_space *mapping,
                    pgoff_t offset, int fgp_flags)
{
    return pagecache_get_page(mapping, offset, fgp_flags, 0);
}
/**
 * pagecache_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 * @fgp_flags: PCG flags
 * @gfp_mask: gfp mask to use for the page cache data page allocation
 *
 * Looks up the page cache slot at @mapping & @offset.
 *
 * PCG flags modify how the page is returned.
 *
 * @fgp_flags can be:
 *
 * - FGP_ACCESSED: the page will be marked accessed
 * - FGP_LOCK: Page is return locked
 * - FGP_CREAT: If page is not present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU
 *   list. The page is returned locked and with an increased
 *   refcount. Otherwise, NULL is returned.
 *
 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 * if the GFP flags specified for FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
    int fgp_flags, gfp_t gfp_mask)
{
    struct page *page;

repeat:
    page = find_get_entry(mapping, offset);
    if (radix_tree_exceptional_entry(page))
        page = NULL;
    if (!page)
        goto no_page;

    if (fgp_flags & FGP_LOCK) {
        if (fgp_flags & FGP_NOWAIT) {
            if (!trylock_page(page)) {
                put_page(page);
                return NULL;
            }
        } else {
            lock_page(page);
        }

        /* Has the page been truncated? */
        if (unlikely(page->mapping != mapping)) {
            unlock_page(page);
            put_page(page);
            goto repeat;
        }
        VM_BUG_ON_PAGE(page->index != offset, page);
    }

    if (page && (fgp_flags & FGP_ACCESSED))
        mark_page_accessed(page);

no_page:
    if (!page && (fgp_flags & FGP_CREAT)) {
        int err;
        if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
            gfp_mask |= __GFP_WRITE;
        if (fgp_flags & FGP_NOFS)
            gfp_mask &= ~__GFP_FS;

        page = __page_cache_alloc(gfp_mask);
        if (!page)
            return NULL;

        if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
            fgp_flags |= FGP_LOCK;

        /* Init accessed so avoid atomic mark_page_accessed later */
        if (fgp_flags & FGP_ACCESSED)
            __SetPageReferenced(page);

        err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
        if (unlikely(err)) {
            put_page(page);
            page = NULL;
            if (err == -EEXIST)
                goto repeat;
        }
    }

    return page;
}
EXPORT_SYMBOL(pagecache_get_page);
Too convoluted — focus on the key part. Under the no_page label sits the allocation: page = __page_cache_alloc(gfp_mask). So where does that gfp_mask come from? find_get_page passes a gfp_mask of 0 (quoted below) — and what kind of flag is 0 supposed to be... fine, the next post will be devoted to this zero gfp_mask. In any case, since it is 0, __GFP_MOVABLE is definitely not set (nor is __GFP_RECLAIM), so the page is most likely unmovable.
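For reference, this is where the zero comes from: find_get_page in include/linux/pagemap.h passes 0 for both fgp_flags and gfp_mask.

static inline struct page *find_get_page(struct address_space *mapping,
                    pgoff_t offset)
{
    return pagecache_get_page(mapping, offset, 0, 0);
}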
Right — so how is an unmovable page migrated? Only migrate_pages can tell us. That function mainly calls unmap_and_move.
/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                   free_page_t put_new_page,
                   unsigned long private, struct page *page,
                   int force, enum migrate_mode mode,
                   enum migrate_reason reason)
{
    int rc = MIGRATEPAGE_SUCCESS;
    struct page *newpage;

    if (!thp_migration_supported() && PageTransHuge(page))
        return -ENOMEM;

    newpage = get_new_page(page, private);
    if (!newpage)
        return -ENOMEM;

    if (page_count(page) == 1) {
        /* page was freed from under us. So we are done. */
        ClearPageActive(page);
        ClearPageUnevictable(page);
        if (unlikely(__PageMovable(page))) {
            lock_page(page);
            if (!PageMovable(page))
                __ClearPageIsolated(page);
            unlock_page(page);
        }
        if (put_new_page)
            put_new_page(newpage, private);
        else
            put_page(newpage);
        goto out;
    }

    rc = __unmap_and_move(page, newpage, force, mode);
    if (rc == MIGRATEPAGE_SUCCESS)
        set_page_owner_migrate_reason(newpage, reason);

out:
    if (rc != -EAGAIN) {
        /*
         * A page that has been migrated has all references
         * removed and will be freed. A page that has not been
         * migrated will have kepts its references and be
         * restored.
         */
        list_del(&page->lru);

        /*
         * Compaction can migrate also non-LRU pages which are
         * not accounted to NR_ISOLATED_*. They can be recognized
         * as __PageMovable
         */
        if (likely(!__PageMovable(page)))
            mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
                    page_is_file_cache(page), -hpage_nr_pages(page));
    }

    /*
     * If migration is successful, releases reference grabbed during
     * isolation. Otherwise, restore the page to right list unless
     * we want to retry.
     */
    if (rc == MIGRATEPAGE_SUCCESS) {
        put_page(page);
        if (reason == MR_MEMORY_FAILURE) {
            /*
             * Set PG_HWPoison on just freed page
             * intentionally. Although it's rather weird,
             * it's how HWPoison flag works at the moment.
             */
            if (set_hwpoison_free_buddy_page(page))
                num_poisoned_pages_inc();
        }
    } else {
        if (rc != -EAGAIN) {
            if (likely(!__PageMovable(page))) {
                putback_lru_page(page);
                goto put_new;
            }

            lock_page(page);
            if (PageMovable(page))
                putback_movable_page(page);
            else
                __ClearPageIsolated(page);
            unlock_page(page);
            put_page(page);
        }
put_new:
        if (put_new_page)
            put_new_page(newpage, private);
        else
            put_page(newpage);
    }

    return rc;
}
unmap_and_move first calls get_new_page to allocate a new page — the migration destination — and then calls __unmap_and_move to do the actual move. Inside __unmap_and_move we only follow the file-page-related path, where the real migration is done by move_to_new_page.
/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
    struct address_space *mapping;
    int rc = -EAGAIN;
    bool is_lru = !__PageMovable(page);

    VM_BUG_ON_PAGE(!PageLocked(page), page);
    VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

    mapping = page_mapping(page);

    if (likely(is_lru)) { // true for the LRU pages we isolated
        if (!mapping) // file pages have a non-NULL mapping, so they skip this branch
            rc = migrate_page(mapping, newpage, page, mode);
        else if (mapping->a_ops->migratepage)
            /*
             * Most pages have a mapping and most filesystems
             * provide a migratepage callback. Anonymous pages
             * are part of swap space which also has its own
             * migratepage callback. This is the most common path
             * for page migration.
             */
            rc = mapping->a_ops->migratepage(mapping, newpage,
                            page, mode);
        else
            rc = fallback_migrate_page(mapping, newpage,
                            page, mode);
    } else {
        /*
         * In case of non-lru page, it could be released after
         * isolation step. In that case, we shouldn't try migration.
         */
        VM_BUG_ON_PAGE(!PageIsolated(page), page);
        if (!PageMovable(page)) {
            rc = MIGRATEPAGE_SUCCESS;
            __ClearPageIsolated(page);
            goto out;
        }

        rc = mapping->a_ops->migratepage(mapping, newpage,
                        page, mode);
        WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
                !PageIsolated(page));
    }

    /*
     * When successful, old pagecache page->mapping must be cleared before
     * page is freed; but stats require that PageAnon be left as PageAnon.
     */
    if (rc == MIGRATEPAGE_SUCCESS) {
        if (__PageMovable(page)) {
            VM_BUG_ON_PAGE(!PageIsolated(page), page);

            /*
             * We clear PG_movable under page_lock so any compactor
             * cannot try to migrate this page.
             */
            __ClearPageIsolated(page);
        }

        /*
         * Anonymous and movable page->mapping will be cleard by
         * free_pages_prepare so don't reset it here for keeping
         * the type to work PageAnon, for example.
         */
        if (!PageMappingFlags(page))
            page->mapping = NULL;

        if (unlikely(is_zone_device_page(newpage))) {
            if (is_device_public_page(newpage))
                flush_dcache_page(newpage);
        } else
            flush_dcache_page(newpage);
    }
out:
    return rc;
}
A quick read shows that file pages take the fallback_migrate_page path, unless the filesystem implements mapping->a_ops->migratepage, in which case that is used instead — ext4, for one, wires it up, as shown below. After that snippet, let's look at fallback_migrate_page, the generic path.
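For example, ext4 points the callback at the generic buffer_migrate_page helper (from fs/ext4/inode.c; fields unrelated to migration are elided here):

static const struct address_space_operations ext4_aops = {
    .readpage       = ext4_readpage,
    .writepage      = ext4_writepage,
    /* ... */
    .migratepage    = buffer_migrate_page,
    /* ... */
};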
/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
    struct page *newpage, struct page *page, enum migrate_mode mode)
{
    if (PageDirty(page)) {
        /* Only writeback pages in full synchronous migration */
        switch (mode) {
        case MIGRATE_SYNC:
        case MIGRATE_SYNC_NO_COPY:
            break;
        default:
            return -EBUSY;
        }
        return writeout(mapping, page);
    }

    /*
     * Buffers may be managed in a filesystem specific way.
     * We must have no buffers or drop them.
     */
    if (page_has_private(page) &&
        !try_to_release_page(page, GFP_KERNEL))
        return -EAGAIN;

    return migrate_page(mapping, newpage, page, mode);
}
The function's own comment already says what it is for. For a clean page without filesystem-private buffers, it ends up in migrate_page.
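migrate_page itself is short enough to quote in full — this is the 4.19 version from mm/migrate.c, give or take its leading comment:

int migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page,
        enum migrate_mode mode)
{
    int rc;

    BUG_ON(PageWriteback(page));    /* Writeback must be complete */

    rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);

    if (rc != MIGRATEPAGE_SUCCESS)
        return rc;

    if (mode != MIGRATE_SYNC_NO_COPY)
        migrate_page_copy(newpage, page);
    else
        migrate_page_states(newpage, page);
    return rc;
}
EXPORT_SYMBOL(migrate_page);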
Digging one level further down, we arrive at migrate_page_move_mapping.
/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
int migrate_page_move_mapping(struct address_space *mapping,
        struct page *newpage, struct page *page,
        struct buffer_head *head, enum migrate_mode mode,
        int extra_count)
{
    struct zone *oldzone, *newzone;
    int dirty;
    int expected_count = 1 + extra_count;
    void **pslot;

    /*
     * Device public or private pages have an extra refcount as they are
     * ZONE_DEVICE pages.
     */
    expected_count += is_device_private_page(page);
    expected_count += is_device_public_page(page);

    if (!mapping) {
        /* Anonymous page without mapping */
        if (page_count(page) != expected_count)
            return -EAGAIN;

        /* No turning back from here */
        newpage->index = page->index;
        newpage->mapping = page->mapping;
        if (PageSwapBacked(page))
            __SetPageSwapBacked(newpage);

        return MIGRATEPAGE_SUCCESS;
    }

    oldzone = page_zone(page);
    newzone = page_zone(newpage);

    xa_lock_irq(&mapping->i_pages);

    pslot = radix_tree_lookup_slot(&mapping->i_pages,
                    page_index(page));

    expected_count += hpage_nr_pages(page) + page_has_private(page);
    if (page_count(page) != expected_count ||
        radix_tree_deref_slot_protected(pslot,
                    &mapping->i_pages.xa_lock) != page) {
        xa_unlock_irq(&mapping->i_pages);
        return -EAGAIN;
    }

    if (!page_ref_freeze(page, expected_count)) {
        xa_unlock_irq(&mapping->i_pages);
        return -EAGAIN;
    }

    /*
     * In the async migration case of moving a page with buffers, lock the
     * buffers using trylock before the mapping is moved. If the mapping
     * was moved, we later failed to lock the buffers and could not move
     * the mapping back due to an elevated page count, we would have to
     * block waiting on other references to be dropped.
     */
    if (mode == MIGRATE_ASYNC && head &&
            !buffer_migrate_lock_buffers(head, mode)) {
        page_ref_unfreeze(page, expected_count);
        xa_unlock_irq(&mapping->i_pages);
        return -EAGAIN;
    }

    /*
     * Now we know that no one else is looking at the page:
     * no turning back from here.
     */
    newpage->index = page->index;
    newpage->mapping = page->mapping;
    page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
    if (PageSwapBacked(page)) {
        __SetPageSwapBacked(newpage);
        if (PageSwapCache(page)) {
            SetPageSwapCache(newpage);
            set_page_private(newpage, page_private(page));
        }
    } else {
        VM_BUG_ON_PAGE(PageSwapCache(page), page);
    }

    /* Move dirty while page refs frozen and newpage not yet exposed */
    dirty = PageDirty(page);
    if (dirty) {
        ClearPageDirty(page);
        SetPageDirty(newpage);
    }

    radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
    if (PageTransHuge(page)) {
        int i;
        int index = page_index(page);

        for (i = 1; i < HPAGE_PMD_NR; i++) {
            pslot = radix_tree_lookup_slot(&mapping->i_pages,
                                index + i);
            radix_tree_replace_slot(&mapping->i_pages, pslot,
                                newpage + i);
        }
    }

    /*
     * Drop cache reference from old page by unfreezing
     * to one less reference.
     * We know this isn't the last reference.
     */
    page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));

    xa_unlock(&mapping->i_pages);
    /* Leave irq disabled to prevent preemption while updating stats */

    /*
     * If moved to a different zone then also account
     * the page for that zone. Other VM counters will be
     * taken care of when we establish references to the
     * new page and drop references to the old page.
     *
     * Note that anonymous pages are accounted for
     * via NR_FILE_PAGES and NR_ANON_MAPPED if they
     * are mapped to swap space.
     */
    if (newzone != oldzone) {
        __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
        __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
        if (PageSwapBacked(page) && !PageSwapCache(page)) {
            __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
            __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
        }
        if (dirty && mapping_cap_account_dirty(mapping)) {
            __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
            __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
            __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
            __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
        }
    }
    local_irq_enable();

    return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page_move_mapping);
A quick skim shows this function does just these things: unhook the file page from the mapping->i_pages radix tree, hand page->index and page->mapping over to newpage, and insert newpage into the radix tree. The data copy itself happens back in migrate_page, via migrate_page_copy (quoted below). And that is all it takes to migrate a file page.
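For completeness, the copy step — migrate_page_copy in mm/migrate.c is just the data copy plus a transfer of page state:

void migrate_page_copy(struct page *newpage, struct page *page)
{
    if (PageHuge(page) || PageTransHuge(page))
        copy_huge_page(newpage, page);
    else
        copy_highpage(newpage, page);

    migrate_page_states(newpage, page);
}
EXPORT_SYMBOL(migrate_page_copy);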
Time to wrap up the opening question. First, pages handed to slab are never put on an LRU list (nor do they become targets of mechanisms like KSM), so they are never compacted — that is, never migrated — and there is no need to worry about slab falling apart. Second, unmovable pages can indeed be migrated, but only under a particular condition: the page must be on an LRU list, and as far as I can tell, the unmovable pages found on LRU lists are exactly the file pages. A file page has no special mapping to speak of — it merely hangs off the file's address_space radix tree — so migrating it only takes unhooking the old page and hanging newpage in its place.
The end.