linux页框分配与释放

最新推荐文章于 2025-09-19 11:18:14 发布

原创最新推荐文章于 2025-09-19 11:18:14 发布 · 2k 阅读

8 ·

CC 4.0 BY-SA版权

本文详细解析了Linux内核中的页分配与释放机制，包括快速分配路径、从选定内存域分配页、联合页等概念，同时介绍了页释放过程及伙伴系统的运作原理。

1. 页的分配 2

1.1 Alloc fast path 2

1.1.1从选定内存域分配页 3

1.1.2 Alloc Fallbacks 5

页的分配

1.1 Alloc fast path

alloc_pages--->alloc_pages_node--->__alloc_pages_node --->__alloc_pages--->

__alloc_pages_nodemask--->get_page_from_freelist

/*
 * Fast-path allocation attempt: walk the zones of ac->zonelist, starting at
 * ac->preferred_zoneref, and return the first page that rmqueue() hands back.
 *
 * Returns the page after prep_new_page() has been applied to it, or NULL when
 * no zone visited by the walk produced a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                        const struct alloc_context *ac)
{
    struct zoneref *z = ac->preferred_zoneref;
    struct zone *zone;
    struct pglist_data *last_pgdat_dirty_limit = NULL;
    // Walk the zones of ac->zonelist whose index does not exceed ac->high_zoneidx (filtered by ac->nodemask).
    for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                ac->nodemask) {
        struct page *page;
        unsigned long mark;
        // With cpusets enabled and ALLOC_CPUSET requested, skip zones that __cpuset_zone_allowed() rejects for this gfp_mask.
        if (cpusets_enabled() &&
            (alloc_flags & ALLOC_CPUSET) &&
            !__cpuset_zone_allowed(zone, gfp_mask))
                continue;
        // A non-zero ac->spread_dirty_pages means the allocation is expected to produce dirty pages, so the per-node dirty limit is honoured.
        if (ac->spread_dirty_pages) {
            // This node already failed the dirty check for a previous zone in this walk: skip it.
            if (last_pgdat_dirty_limit == zone->zone_pgdat)
                continue;
            // The node fails node_dirty_ok(): remember it so its other zones are skipped cheaply, then move on.
            if (!node_dirty_ok(zone->zone_pgdat)) {
                last_pgdat_dirty_limit = zone->zone_pgdat;
                continue;
            }
        }
        // Select the watermark indexed by the ALLOC_WMARK_MASK bits of alloc_flags.
        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
        // Check whether the zone's free memory is above that watermark.
        if (!zone_watermark_fast(zone, order, mark,
                       ac_classzone_idx(ac), alloc_flags)) {
            int ret;

            // ALLOC_NO_WATERMARKS set: ignore the failed watermark check and try this zone anyway.
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                goto try_this_zone;
            /*
             * Free pages are below the watermark, so reclaim would be needed.
             * The zone is skipped without reclaim when (1) node reclaim is
             * disabled (node_reclaim_mode == 0), or (2) zone_allows_reclaim()
             * rejects this zone relative to the preferred zone (presumably a
             * node-distance check against RECLAIM_DISTANCE - confirm in the helper).
             */
            if (node_reclaim_mode == 0 ||
                !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                continue;
            // Try to reclaim memory on this zone's node.
            ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
            switch (ret) {
            // Reclaim did not scan: give up on this zone.
            case NODE_RECLAIM_NOSCAN: 
                continue;
            // Nothing more can be reclaimed: give up on this zone.
            case NODE_RECLAIM_FULL:
                continue;
            default:
                // Some pages were reclaimed: re-check the watermark before allocating.
                if (zone_watermark_ok(zone, order, mark,
                        ac_classzone_idx(ac), alloc_flags))
                    goto try_this_zone;

                continue;
            }
        }
        // Reaching this point means the zone passed the watermark check (or the check was waived).
try_this_zone:
        page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                gfp_mask, alloc_flags, ac->migratetype);// attempt the allocation from this zone
        if (page) {
            // Post-allocation preparation of the page (flag clean-up, compound-page setup, ...).
            prep_new_page(page, order, gfp_mask, alloc_flags);
            return page;
        }
    }

    return NULL;
}

1.1.1从选定内存域分配页

1，如果order为0表示分配单页，这个时候就从pcplist中分配（冷热页）

2，如果设置了ALLOC_HARDER表示一次高优先级的分配，就先从迁移类型为MIGRATE_HIGHATOMIC的链表中分配。MIGRATE_HIGHATOMIC类型的页用于一些紧急情况下的内存分配。

3，如果都不是前面的情况就尝试从指定迁移类型migratetype的链表中去分配

4，如果在指定迁移类型的链表中仍然没有分配到，就尝试从其他迁移类型的链表中去偷取

分配冷热页

/*
 * Take one page from the per-CPU page list `list`, refilling the list through
 * rmqueue_bulk() when it is empty.
 *
 * cold selects which end of the list is used: the tail when true, the head
 * otherwise. Returns NULL when the list is still empty after the refill.
 */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
            bool cold, struct per_cpu_pages *pcp,
            struct list_head *list)
{
    struct page *page;

    do {
        if (list_empty(list)) { 
            // The list is empty: ask rmqueue_bulk() for a batch (pcp->batch) of pages and account for however many it added.
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                return NULL;
        }
        // Cold requests take the page at the tail of the list; all other requests take the page at the head.
        if (cold)
            page = list_last_entry(list, struct page, lru);
        else
            page = list_first_entry(list, struct page, lru);

        list_del(&page->lru); // unlink the page from the per-CPU list
        pcp->count--;
    } while (check_new_pcp(page)); // a non-zero check result rejects the page and picks another one

    return page;
}

到指定迁移类型的伙伴系统链表中分配页

函数__rmqueue_smallest从指定迁移类型migratetype中去分配order阶的页块。如果order阶对应的链表没有空闲块，就从更大阶的链表中去分配：将更大的页块拆解，并将剩余部分挂到各自对应阶的链表中去。

/*
 * Allocate a block of the requested order from the free lists of the given
 * migratetype, searching from `order` upwards and splitting a larger block
 * via expand() when necessary.
 *
 * Returns the first page of the block, or NULL when no order below MAX_ORDER
 * has a free block of this migratetype.
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;
    // Search the free lists from the requested order up to MAX_ORDER - 1 for a free block of this migratetype.
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]); // free_area of the order being examined
        page = list_first_entry_or_null(&area->free_list[migratetype],
                            struct page, lru);// first free block of this migratetype, if any
        if (!page)
            continue; // this order has no such block: try the next larger order
        list_del(&page->lru); // unlink the block from its free list
        rmv_page_order(page);// presumably clears the "in buddy system" marking and resets the stored order - confirm in the helper
        area->nr_free--; // one fewer free block at this order
        // Split the current_order block down to the requested order; expand() puts the unused halves back on the free lists.
        expand(zone, page, order, current_order, area, migratetype);
        // Record the migratetype on the page (the article notes it is stored in page->index).
        set_pcppage_migratetype(page, migratetype);
        return page;
    }

    return NULL;
}

low：为希望分配的阶，high：实际发现有空闲页的阶；area：为high对应的free_area；migratetype：指定的迁移类型

/*
 * Split a free block of order `high` down to order `low`.
 *
 * page is the first page of the block, area is the free_area that belongs to
 * order `high`. On every step the block is halved: the upper half is linked
 * into the free list (of the given migratetype) one order lower and tagged
 * with that order, while the lower half is kept for the next step. Nothing
 * happens when high <= low.
 */
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long nr_pages = 1 << high;
    int cur_order;

    for (cur_order = high; cur_order > low; ) {
        struct page *upper_half;

        /* Step down one order: matching free_area, half the page count. */
        cur_order--;
        area--;
        nr_pages >>= 1;

        /* The upper half goes back to the buddy free list of that order. */
        upper_half = &page[nr_pages];
        list_add(&upper_half->lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(upper_half, cur_order);
    }
}

1.1.2 Alloc Fallbacks

/*
 * Look for a free block in a fallback migratetype and hand it to
 * steal_suitable_fallback() so that it ends up on the start_migratetype
 * free list.
 *
 * Returns true when a block was found and processed, false when no order in
 * [order, MAX_ORDER - 1] offers a suitable fallback block.
 */
static inline bool
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
    struct free_area *area;
    unsigned int current_order;
    struct page *page;
    int fallback_mt;
    bool can_steal;
    /*
     * Start from the largest order so the biggest available block is found
     * first. current_order is unsigned: when order is 0 the decrement wraps
     * around, which the "<= MAX_ORDER-1" test catches to end the loop.
     */
    for (current_order = MAX_ORDER-1;
                current_order >= order && current_order <= MAX_ORDER-1;
                --current_order) {
        area = &(zone->free_area[current_order]);
        // For this order, find_suitable_fallback() walks fallbacks[start_migratetype][] and returns a fallback migratetype with free blocks, or -1.
        fallback_mt = find_suitable_fallback(area, current_order,
                start_migratetype, false, &can_steal);
        if (fallback_mt == -1) // nothing suitable at this order: try the next smaller one
            continue;
        page = list_first_entry(&area->free_list[fallback_mt],
                        struct page, lru);
        // A block was found; steal_suitable_fallback() decides between claiming the whole pageblock (changing its migratetype) and only moving this one free block.
        steal_suitable_fallback(zone, page, start_migratetype,
                                can_steal);
        return true;
    }
    return false;
}

查找备用迁移类型

备用迁移类型fallbacks是一个二维数组，表示如果指定迁移类型分配失败，其他可用的迁移类型列表

/*
 * Fallback table: row [mt] lists, in order of preference, the migratetypes to
 * try when an allocation of type mt cannot be satisfied from its own free
 * lists. Each row is terminated by MIGRATE_TYPES, which is the sentinel that
 * find_suitable_fallback() stops on.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
    [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
#endif      
#ifdef CONFIG_MEMORY_ISOLATION
    [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
#endif      
};

int find_suitable_fallback(struct free_area *area, unsigned int order,
            int migratetype, bool only_stealable, bool *can_steal)
{
    int i;
    int fallback_mt;
    if (area->nr_free == 0)//首先指定order的列表中要有空闲页块
        return -1;
    *can_steal = false;
    for (i = 0;; i++) { //循环备用列表
        fallback_mt = fallbacks[migratetype][i];
        if (fallback_mt == MIGRATE_TYPES)//达到列表末尾就退出
            break;

        if (list_empty(&area->free_list[fallback_mt]))
            continue; //如果当前迁移类型列表为空
//判断是否可以盗用该页块，判断条件：1）order >= pageblock_order / 2；2）迁移类型为 MIGRATE_RECLAIMABLE；3）迁移类型为MIGRATE_UNMOVABLE
        if (can_steal_fallback(order, migratetype))
            *can_steal = true;
        if (!only_stealable)
            return fallback_mt;
        if (*can_steal)
            return fallback_mt;
    }
    return -1;
}
/*
 * Given a free block `page` found on a fallback free list, either claim its
 * whole pageblock for start_type or just move this one free block onto the
 * start_type free list of its order.
 *
 * whole_block is the can_steal result computed by find_suitable_fallback().
 */
static void steal_suitable_fallback(struct zone *zone, struct page *page,
                    int start_type, bool whole_block)
{
    unsigned int current_order = page_order(page);
    struct free_area *area;
    int free_pages, movable_pages, alike_pages;
    int old_block_type;
    // Remember the pageblock's current migratetype.
    old_block_type = get_pageblock_migratetype(page);
    // Pageblocks reserved for urgent allocations (MIGRATE_HIGHATOMIC) must keep their migratetype.
    if (is_migrate_highatomic(old_block_type))
        goto single_page;
    /*
     * The free block spans at least one whole pageblock. Migratetypes are
     * recorded per pageblock_order-sized block, so change_pageblock_range()
     * is used to retag every pageblock the free block covers.
     */
    if (current_order >= pageblock_order) {
        change_pageblock_range(page, current_order, start_type);
        goto single_page;
    }
    // whole_block comes from find_suitable_fallback(): when false, claiming the whole pageblock is not allowed.
    if (!whole_block)
        goto single_page;
    /*
     * move_freepages_block() returns the number of free pages it handled in
     * this pageblock and reports through movable_pages the pages that are in
     * use but movable. NOTE(review): presumably it also moves the free pages
     * onto the start_type free lists - confirm in the helper.
     */
    free_pages = move_freepages_block(zone, page, start_type,
                        &movable_pages);
    // alike_pages: in-use pages regarded as compatible with start_type.
    if (start_type == MIGRATE_MOVABLE) {
        alike_pages = movable_pages;
    } else {
        // Non-movable request falling back to a MOVABLE block: every page that is neither free nor movable counts as compatible.
        if (old_block_type == MIGRATE_MOVABLE)
            alike_pages = pageblock_nr_pages
                        - (free_pages + movable_pages);
        else
            alike_pages = 0;
    }
    // free_pages can be 0 (the article attributes this to a pageblock crossing a zone boundary); the pageblock type must not change then.
    if (!free_pages)
        goto single_page;
    // Claim the whole pageblock when free plus compatible pages make up at least half of it, or when grouping by mobility is disabled.
    if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
            page_group_by_mobility_disabled)
        // The article notes this updates the pageblock's bits in zone->pageblock_flags.
        set_pageblock_migratetype(page, start_type); 
    return;

single_page:
    area = &zone->free_area[current_order]; 
    list_move(&page->lru, &area->free_list[start_type]); // move only this free block onto the start_type list of its order; per the article the caller (__rmqueue) then retries the allocation
}

1.1.3联合页

前面讲解了通过调用函数rmqueue来做页分配，如果分配成功，在返回页之前要对页做一些预处理，比如分配标志gfp_flags中设置了__GFP_COMP请求多个页，就需要将分配到的多个页组合成复合页。第一页称为首页，其余页都是尾页，具体组合逻辑如下：

/*
 * Turn the 2^order contiguous pages starting at `page` into a compound page.
 *
 * The first page becomes the head: it receives the destructor index
 * (COMPOUND_PAGE_DTOR), the compound order and the head flag. Every following
 * page becomes a tail page: its count is set to 0, its mapping is set to
 * TAIL_MAPPING and it is linked back to the head. Finally the compound
 * mapcount of the head is initialised to -1.
 */
void prep_compound_page(struct page *page, unsigned int order)
{
    int nr_pages = 1 << order;
    int idx = 1;

    /* Head page set-up. */
    set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
    set_compound_order(page, order);
    __SetPageHead(page);

    /* Tail pages: everything after the head. */
    while (idx < nr_pages) {
        struct page *tail = &page[idx];

        set_page_count(tail, 0);
        tail->mapping = TAIL_MAPPING;
        set_compound_head(tail, page);
        idx++;
    }

    atomic_set(compound_mapcount_ptr(page), -1);
}

1.2 Alloc slowpath

前面快速分配内存没有成功下面通过各种途径尝试分配到所需内存，慢速分配步骤如下：

降低水印ALLOC_WMARK_MIN，如果设置了__GFP_KSWAPD_RECLAIM就唤醒交换线程；
调用get_page_from_freelist尝试重新分配；
如果分配的页阶大于0尝试内存压缩，通过内存迁移合并出较大的内存块，然后尝试内存分配；
如果设置了__GFP_KSWAPD_RECLAIM再次唤醒交换线程，确保交换线程不会意外睡去；
直接进行内存回收之后尝试分配；
如果内存回收没有分配到所需内存，就直接进行内存压缩之后尝试分配；
检查分配标志是否存在一些潜在可调的空间，然后再次调用get_page_from_freelist尝试分配；
如果没有回收到足够的内存就尝试杀死一些进程然后尝试分配内存；
如果仍然没有分配到内存，分配标志中设置了__GFP_NOFAIL就设置ALLOC_HARDER尝试做内存分配。

/*
 * Slow-path page allocation, entered after the fast path failed.
 *
 * This is an abridged excerpt: "……" marks code that was elided, including the
 * declarations and initialisation of the locals used below (alloc_flags,
 * page, can_direct_reclaim, costly_order, compact_result, compact_priority,
 * compaction_retries, no_progress_loops, did_some_progress,
 * cpuset_mems_cookie). The brace structure, the `fail` label and the
 * __GFP_NOFAIL retry follow the upstream function so that the control flow
 * shown here is well-formed.
 *
 * Order of attempts: rebuild the allocation flags, wake kswapd, retry the
 * free lists, direct compaction (for selected requests), then loop over
 * direct reclaim / direct compaction / retry decisions, and finally the OOM
 * killer. Returns the allocated page, or NULL on failure.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                        struct alloc_context *ac)
{
 ……

retry_cpuset:
    compaction_retries = 0;
    no_progress_loops = 0;
    compact_priority = DEF_COMPACT_PRIORITY;
    cpuset_mems_cookie = read_mems_allowed_begin();

    // Rebuild the allocation flags from gfp_mask (the article notes this lowers the watermark to ALLOC_WMARK_MIN).
    alloc_flags = gfp_to_alloc_flags(gfp_mask);
    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->high_zoneidx, ac->nodemask);
    if (!ac->preferred_zoneref->zone)
        goto nopage;
    // Wake the kswapd threads when __GFP_KSWAPD_RECLAIM is set.
    if (gfp_mask & __GFP_KSWAPD_RECLAIM)
        wake_all_kswapds(order, ac);
    // Retry the free lists with the adjusted allocation flags.
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        goto got_pg;

    /*
     * The failure may be due to fragmentation. For costly orders, or for
     * non-movable requests above order 0, try direct compaction (page
     * migration) first and allocate from the result.
     */
    if (can_direct_reclaim &&
            (costly_order ||
               (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
            && !gfp_pfmemalloc_allowed(gfp_mask)) {
        page = __alloc_pages_direct_compact(gfp_mask, order,
                        alloc_flags, ac,
                        INIT_COMPACT_PRIORITY,
                        &compact_result);
        if (page)
            goto got_pg;

        // …… (handling of costly __GFP_NORETRY requests elided)
    }

retry:
    // Make sure the kswapd threads have not gone back to sleep while we loop.
    if (gfp_mask & __GFP_KSWAPD_RECLAIM)
        wake_all_kswapds(order, ac);
    // If gfp_mask permits it, allocate while ignoring watermarks.
    if (gfp_pfmemalloc_allowed(gfp_mask))
        alloc_flags = ALLOC_NO_WATERMARKS;

    // Try again; the allocation flags may have been adjusted above.
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        goto got_pg;
    if (!can_direct_reclaim)
        goto nopage;

    // Reclaim comes next; bail out if the caller is itself a reclaimer (PF_MEMALLOC) to avoid recursion.
    if (current->flags & PF_MEMALLOC)
        goto nopage;

    // Direct reclaim, then allocate.
    page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
                            &did_some_progress);
    if (page)
        goto got_pg;

    // Direct compaction, then allocate.
    page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
                    compact_priority, &compact_result);
    if (page)
        goto got_pg;

    // __GFP_NORETRY: do not loop.
    if (gfp_mask & __GFP_NORETRY)
        goto nopage;

    // Decide whether another round of reclaim is worthwhile.
    if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
                 did_some_progress > 0, &no_progress_loops))
        goto retry;

    // Decide whether another round of compaction is worthwhile (only if reclaim made progress).
    if (did_some_progress > 0 &&
            should_compact_retry(ac, order, alloc_flags,
                compact_result, &compact_priority,
                &compaction_retries))
        goto retry;
    // Last resort: let the OOM killer free memory for this allocation.
    page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
    if (page)
        goto got_pg;
nopage:
    // __GFP_NOFAIL: try once more with ALLOC_HARDER, and keep retrying rather than return NULL.
    if (gfp_mask & __GFP_NOFAIL) {
        // A NOFAIL request that cannot reclaim cannot be retried meaningfully: warn once and fail.
        if (WARN_ON_ONCE(!can_direct_reclaim))
            goto fail;

        page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
        if (page)
            goto got_pg;

        cond_resched();
        goto retry;
    }
fail:
    // …… (allocation-failure warning elided)
got_pg:
    return page;
}

页的释放

函数free_pages用于释放内存页，传入的参数是页的虚拟地址和order。函数首先根据虚拟地址得到对应的page结构，然后判断order是否为0：如果为0就释放到per cpu lists中去，此时如果per cpu lists中缓存的页数超过其上限pcp->high，就释放一批（pcp->batch个）页到伙伴系统中去；如果order大于0就直接释放到伙伴系统中。

下面是从per cpu lists中释放一批pcp->batch页到伙伴系统中的实现：

/*
 * Return `count` pages from the per-CPU lists of pcp to the buddy allocator
 * of zone, one page at a time through __free_one_page().
 *
 * Abridged excerpt: "……" marks elided code. NOTE(review): the matching lock
 * acquisition for the spin_unlock() at the end and the initialisation of
 * isolated_pageblocks are presumably in the elided part - confirm against the
 * full source.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
                    struct per_cpu_pages *pcp)
{
    int migratetype = 0;
    int batch_free = 0;
    bool isolated_pageblocks;
	……
    /*
     * The per-CPU lists are split by migratetype, so the pages to free have
     * to be spread over several lists. The lists are visited round-robin;
     * batch_free grows by one for every list the selection loop steps over,
     * so more pages are taken from a list that follows empty ones. This
     * repeats until `count` pages have been freed.
     */
    while (count) { 
        struct page *page;
        struct list_head *list;
        // Pick the next non-empty list and work out how many pages (batch_free) to take from it.
        do {
            batch_free++;
            if (++migratetype == MIGRATE_PCPTYPES)
                migratetype = 0;
            list = &pcp->lists[migratetype];
        } while (list_empty(list));

        // batch_free reached MIGRATE_PCPTYPES: take all remaining pages from this list (presumably it is the only non-empty one).
        if (batch_free == MIGRATE_PCPTYPES)
            batch_free = count;
        // Free up to batch_free pages from the tail of the selected list.
        do {
            int mt; 

            page = list_last_entry(list, struct page, lru);
            list_del(&page->lru);

            mt = get_pcppage_migratetype(page);
            // With isolated pageblocks around, use the pageblock's current migratetype instead of the one recorded on the page.
            if (unlikely(isolated_pageblocks))
                mt = get_pageblock_migratetype(page);

            if (bulkfree_pcp_prepare(page)) // per the article: memory-debugging hook (poisoning); a non-zero result skips the page
                continue;
            // Hand one order-0 page back to the buddy allocator.
            __free_one_page(page, page_to_pfn(page), zone, 0, mt);
        } while (--count && --batch_free && !list_empty(list));
    }
    spin_unlock(&zone->lock);
}

伙伴系统

cat /proc/buddyinfo可以看到伙伴系统所有内存域每个阶剩余的页块信息：

cat /proc/pagetypeinfo可以看到每个页阶中所有迁移类型中的页块数：

函数__free_one_page中实现了伙伴系统的核心逻辑：

/*
 * Core of the buddy allocator's free path: give the block of 2^order pages
 * starting at page/pfn back to zone, merging it with its free buddy again and
 * again for as long as possible, then link the resulting block into the free
 * list of its final order and the given migratetype.
 *
 * Abridged excerpt: "……" marks elided code (the continue_merging label is
 * presumably used from there).
 */
static inline void __free_one_page(struct page *page,
        unsigned long pfn,
        struct zone *zone, unsigned int order,
        int migratetype)
{
    unsigned long combined_pfn;
    unsigned long uninitialized_var(buddy_pfn);
    struct page *buddy;
    unsigned int max_order;
    max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
continue_merging:
while (order < max_order - 1) {
// The article describes __find_buddy_pfn() as the single expression pfn ^ (1 << order), i.e. pfn with bit `order` flipped.
        buddy_pfn = __find_buddy_pfn(pfn, order);
/*
 * buddy_pfn - pfn is -(1 << order) when that bit is set in pfn and
 * +(1 << order) when it is clear, so the buddy's struct page lies (1 << order)
 * entries to the left or right of `page`.
 */
        buddy = page + (buddy_pfn - pfn);
/*
 * Stop merging if the buddy pfn is not valid or the buddy cannot be merged.
 * According to the article, page_is_buddy() checks that the buddy
 * 1) is not inside a memory hole,
 * 2) is currently in the buddy system (free),
 * 3) has the same order as page,
 * 4) belongs to the same zone as page.
 */
        if (!pfn_valid_within(buddy_pfn))
            goto done_merging;
        if (!page_is_buddy(page, buddy, order))
            goto done_merging;
// Unlink the buddy from its free list: it is about to be merged, and the merged block will be linked at a higher order.

        list_del(&buddy->lru);
// One fewer free block at this order.
        zone->free_area[order].nr_free--;
// Per the article: clears the buddy's "in buddy system" marking and resets its stored order (page->private) to 0.

        rmv_page_order(buddy);
// pfn and buddy_pfn differ only in bit `order`; AND-ing them clears that bit and yields the start pfn of the merged block.
        combined_pfn = buddy_pfn & pfn;
// First page of the merged order+1 block.
        page = page + (combined_pfn - pfn);
        pfn = combined_pfn; // continue from the merged block's pfn
        order++; // the block is now one order larger
}
   ……
done_merging:
// Record the final order on the first page (the article notes this also marks it as a buddy-system page).
    set_page_order(page, order); 
 /*
  * Merging from order to order+1 just failed. If the order+1 block that would
  * contain this one already has a free buddy, the missing merge looks likely
  * to succeed later, so this block is queued at the tail of its free list.
  * The allocation side takes the first entry of a list, which makes this block
  * less likely to be handed out before it can merge.
  */ 
   if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
        struct page *higher_page, *higher_buddy;
// First page of the would-be order+1 block.
        combined_pfn = buddy_pfn & pfn;
        higher_page = page + (combined_pfn - pfn);
// Buddy of that order+1 block.
        buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
        higher_buddy = higher_page + (buddy_pfn - combined_pfn);
// If the order+1 buddy is valid and mergeable, link this block at the tail of the list.
        if (pfn_valid_within(buddy_pfn) &&
            page_is_buddy(higher_page, higher_buddy, order + 1)) {
            list_add_tail(&page->lru,
                &zone->free_area[order].free_list[migratetype]);
            goto out;
        }
    }
// Otherwise link the block at the head of the list.
    list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
// One more free block at this order.
    zone->free_area[order].nr_free++;
}