__get_free_pages_getfreepages-CSDN博客

本文链接：https://blog.csdn.net/YuZhiHui_No1/article/details/50761516

/*
 * zone_reclaim - attempt to reclaim pages from a single zone.
 * @zone:     zone to reclaim from
 * @gfp_mask: GFP flags of the allocation that triggered the reclaim
 * @order:    allocation order, passed through to __zone_reclaim()
 *
 * Returns ZONE_RECLAIM_FULL when the zone is below both reclaim thresholds
 * or is flagged all_unreclaimable, ZONE_RECLAIM_NOSCAN when no scan is
 * attempted (caller may not wait, remote node with CPUs, or the reclaim
 * flag is already set), and otherwise the value of __zone_reclaim().
 */
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
    int nid;
    int reclaimed;

    /*
     * Zone reclaim works on unmapped file backed pages and on slab
     * pages, and only once they exceed the configured limits.
     *
     * Some unmapped file backed pages must be left for file I/O;
     * otherwise pages read by file I/O would be thrown out right away
     * on an overallocated zone. Hence nothing is reclaimed while both
     * counters are at or below their per-zone minimums.
     */
    if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
        zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) {
        return ZONE_RECLAIM_FULL;
    }

    /* The zone is marked as having nothing reclaimable. */
    if (zone->all_unreclaimable) {
        return ZONE_RECLAIM_FULL;
    }

    /*
     * Do not scan if the allocation should not be delayed: either the
     * caller did not pass __GFP_WAIT, or the current task already has
     * PF_MEMALLOC set.
     */
    if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) {
        return ZONE_RECLAIM_NOSCAN;
    }

    /*
     * Only run zone reclaim on the local zone or on zones that do not
     * have associated processors. This favors the local processor over
     * remote processors and spreads off node memory allocations as
     * wide as possible.
     */
    nid = zone_to_nid(zone);
    if (node_state(nid, N_CPU) && nid != numa_node_id()) {
        return ZONE_RECLAIM_NOSCAN;
    }

    /*
     * ZONE_RECLAIM_LOCKED already set: presumably another context is
     * reclaiming this zone, so back off. Otherwise the flag is now ours.
     */
    if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) {
        return ZONE_RECLAIM_NOSCAN;
    }

    reclaimed = __zone_reclaim(zone, gfp_mask, order);

    /* Drop the flag taken above before reporting the result. */
    zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

    if (reclaimed == 0) {
        count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
    }

    return reclaimed;
}

/*
 * Estimate the number of page cache pages in @zone that zone reclaim may
 * consider, based on the bits set in zone_reclaim_mode.
 *
 * Returns the candidate page count minus the pages excluded as dirty;
 * the subtraction is clamped so the result never goes below zero.
 */
static long zone_pagecache_reclaimable(struct zone *zone)
{
    long candidates;
    long excluded = 0;

    /*
     * With RECLAIM_SWAP set, all file pages are considered potentially
     * reclaimable. Without it we have to worry about pages like
     * swapcache, and zone_unmapped_file_pages() provides a better
     * estimate.
     */
    if (!(zone_reclaim_mode & RECLAIM_SWAP)) {
        candidates = zone_unmapped_file_pages(zone);
    } else {
        candidates = zone_page_state(zone, NR_FILE_PAGES);
    }

    /* Dirty pages cannot be cleaned without RECLAIM_WRITE; leave them out. */
    if (!(zone_reclaim_mode & RECLAIM_WRITE)) {
        excluded += zone_page_state(zone, NR_FILE_DIRTY);
    }

    /* Clamp so that the subtraction below cannot underflow. */
    if (unlikely(excluded > candidates)) {
        excluded = candidates;
    }

    return candidates - excluded;
}

/* ---------------- 慢速分配 (slow-path allocation) ---------------- */

/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
 * __GFP_FS is also cleared as it implies __GFP_IO.
 */ 
static inline gfp_t memalloc_noio_flags(gfp_t flags)
{
    /* Bits stripped for a NOIO task: I/O and filesystem permission. */
    const gfp_t io_bits = __GFP_IO | __GFP_FS;

    /* Task is marked PF_MEMALLOC_NOIO: hand back the mask without them. */
    if (unlikely(current->flags & PF_MEMALLOC_NOIO))
        return flags & ~io_bits;

    /* Otherwise the mask is returned unchanged. */
    return flags;
}

/*
 * Call-site excerpt, not a definition: the request is handed to
 * __alloc_pages_slowpath() and the returned page pointer is stored in
 * `page`. NOTE(review): the enclosing function is not shown at this point.
 */
page = __alloc_pages_slowpath(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);

/*
 * __alloc_pages_slowpath - escalating fallback path of the page allocator.
 *
 * Attempts, in order:
 *   1. wake kswapd (unless __GFP_NO_KSWAPD) and derive alloc_flags;
 *   2. retry the free lists with watermarks enforced;
 *   3. if ALLOC_NO_WATERMARKS is set, allocate ignoring watermarks;
 *   4. give up for callers that cannot wait, already have PF_MEMALLOC set,
 *      or are TIF_MEMDIE without __GFP_NOFAIL;
 *   5. direct compaction, then direct reclaim;
 *   6. the OOM path when reclaim made no progress;
 *   7. loop back to `rebalance` while should_alloc_retry() agrees,
 *      otherwise one last direct compaction.
 *
 * Returns the allocated page (after the kmemcheck hook at got_pg). On
 * failure warn_alloc_failed() is called and `page`, which is NULL on every
 * path that reaches `nopage`, is returned.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
    struct zonelist *zonelist, enum zone_type high_zoneidx,
    nodemask_t *nodemask, struct zone *preferred_zone,
    int migratetype)
{
    const gfp_t wait = gfp_mask & __GFP_WAIT;// non-zero when the caller passed __GFP_WAIT (may wait)
    struct page *page = NULL;
    int alloc_flags;
    unsigned long pages_reclaimed = 0;
    unsigned long did_some_progress;
    bool sync_migration = false;
    bool deferred_compaction = false;
    bool contended_compaction = false;

    /*
     * In the slowpath, we sanity check order to avoid ever trying to
     * reclaim >= MAX_ORDER areas which will never succeed. Callers may
     * be using allocators in order of preference for an area that is
     * too large.
     */
    if (order >= MAX_ORDER) {
        WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
        return NULL;
    }

    /*
     * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
     * __GFP_NOWARN set) should not cause reclaim since the subsystem
     * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
     * using a larger set of nodes after it has established that the
     * allowed per node queues are empty and that nodes are
     * over allocated.
     */
    if (IS_ENABLED(CONFIG_NUMA) &&
            (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
        goto nopage;

restart:
    /* Re-entered from the OOM branch below via `goto restart`. */
    if (!(gfp_mask & __GFP_NO_KSWAPD))
        wake_all_kswapd(order, zonelist, high_zoneidx,
                        zone_idx(preferred_zone));

    /*
     * OK, we're below the kswapd watermark and have kicked background
     * reclaim. Now things get more complex, so set up alloc_flags according
     * to how we want to proceed.
     */
    alloc_flags = gfp_to_alloc_flags(gfp_mask);

    /*
     * Find the true preferred zone if the allocation is unconstrained by
     * cpusets.
     */
    if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
        first_zones_zonelist(zonelist, high_zoneidx, NULL,
                    &preferred_zone);

rebalance:
    /* This is the last chance, in general, before the goto nopage. */
    page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
            high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
            preferred_zone, migratetype);
    if (page)
        goto got_pg;

    /* Allocate without watermarks if the context allows */
    if (alloc_flags & ALLOC_NO_WATERMARKS) {
        /*
         * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
         * the allocation is high priority and these type of
         * allocations are system rather than user orientated
         */
        zonelist = node_zonelist(numa_node_id(), gfp_mask);

        page = __alloc_pages_high_priority(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
        if (page) {
            goto got_pg;
        }
    }

    /* Atomic allocations - we can't balance anything */
    if (!wait)// caller may not wait: fail the allocation here
        goto nopage;

    /* Avoid recursion of direct reclaim */
    if (current->flags & PF_MEMALLOC)// task already has PF_MEMALLOC set; do not enter compaction/reclaim again
        goto nopage;

    /* Avoid allocations with no watermarks from looping endlessly */
    if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
        goto nopage;

    /*
     * Try direct compaction. The first pass is asynchronous. Subsequent
     * attempts after direct reclaim are synchronous
     */
    page = __alloc_pages_direct_compact(gfp_mask, order,
                    zonelist, high_zoneidx,
                    nodemask,
                    alloc_flags, preferred_zone,
                    migratetype, sync_migration,
                    &contended_compaction,
                    &deferred_compaction,
                    &did_some_progress);
    if (page)
        goto got_pg;
    sync_migration = true;

    /*
     * If compaction is deferred for high-order allocations, it is because
     * sync compaction recently failed. If this is the case and the caller
     * requested a movable allocation that does not heavily disrupt the
     * system then fail the allocation instead of entering direct reclaim.
     */
    if ((deferred_compaction || contended_compaction) &&
                        (gfp_mask & __GFP_NO_KSWAPD))
        goto nopage;

    /* Try direct reclaim and then allocating */
    page = __alloc_pages_direct_reclaim(gfp_mask, order,
                    zonelist, high_zoneidx,
                    nodemask,
                    alloc_flags, preferred_zone,
                    migratetype, &did_some_progress);
    if (page)
        goto got_pg;

    /*
     * If we failed to make any progress reclaiming, then we are
     * running out of options and have to consider going OOM
     */
    if (!did_some_progress) {
        if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
            if (oom_killer_disabled)
                goto nopage;
            /* Coredumps can quickly deplete all memory reserves */
            if ((current->flags & PF_DUMPCORE) &&
                !(gfp_mask & __GFP_NOFAIL))
                goto nopage;
            page = __alloc_pages_may_oom(gfp_mask, order,
                    zonelist, high_zoneidx,
                    nodemask, preferred_zone,
                    migratetype);
            if (page)
                goto got_pg;

            if (!(gfp_mask & __GFP_NOFAIL)) {
                /*
                 * The oom killer is not called for high-order
                 * allocations that may fail, so if no progress
                 * is being made, there are no other options and
                 * retrying is unlikely to help.
                 */
                if (order > PAGE_ALLOC_COSTLY_ORDER)
                    goto nopage;
                /*
                 * The oom killer is not called for lowmem
                 * allocations to prevent needlessly killing
                 * innocent tasks.
                 */
                if (high_zoneidx < ZONE_NORMAL)
                    goto nopage;
            }

            goto restart;
        }
    }

    /* Check if we should retry the allocation */
    pages_reclaimed += did_some_progress;
    if (should_alloc_retry(gfp_mask, order, did_some_progress,
                        pages_reclaimed)) {
        /* Wait for some write requests to complete then retry */
        wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
        goto rebalance;
    } else {
        /*
         * High-order allocations do not necessarily loop after
         * direct reclaim and reclaim/compaction depends on compaction
         * being called after reclaim so call directly if necessary
         */
        page = __alloc_pages_direct_compact(gfp_mask, order,
                    zonelist, high_zoneidx,
                    nodemask,
                    alloc_flags, preferred_zone,
                    migratetype, sync_migration,
                    &contended_compaction,
                    &deferred_compaction,
                    &did_some_progress);
        if (page)
            goto got_pg;
    }

nopage:
    /* Failure exit: report the failed allocation, then return page (NULL here). */
    warn_alloc_failed(gfp_mask, order, NULL);
    return page;
got_pg:
    /* Success exit: notify kmemcheck about the new pages when it is enabled. */
    if (kmemcheck_enabled)
        kmemcheck_pagealloc_alloc(page, order, gfp_mask);

    return page;
}

static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
                        enum zone_type high_zoneidx,
                        enum zone_type classzone_idx)
{
    struct zoneref *z;
    struct zone *zone;

    for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)//从zonelist中的列表中选择最合适的zone进行交互
        wakeup_kswapd(zone, order, classzone_idx);
}

/*
 * gfp_to_alloc_flags - derive the allocator's internal ALLOC_* flags from
 * the caller's GFP mask and the current execution context.
 * @gfp_mask: GFP flags of the allocation request
 *
 * Starts from ALLOC_WMARK_MIN | ALLOC_CPUSET and then:
 *   - copies __GFP_HIGH straight into the result (same bit as ALLOC_HIGH);
 *   - for "atomic" requests (neither __GFP_WAIT nor __GFP_NO_KSWAPD set)
 *     adds ALLOC_HARDER unless __GFP_NOMEMALLOC, and drops ALLOC_CPUSET;
 *   - for realtime tasks outside interrupt context adds ALLOC_HARDER;
 *   - adds ALLOC_NO_WATERMARKS for __GFP_MEMALLOC, PF_MEMALLOC or
 *     TIF_MEMDIE contexts unless __GFP_NOMEMALLOC forbids it;
 *   - with CONFIG_CMA, adds ALLOC_CMA for MIGRATE_MOVABLE requests only.
 *
 * Returns the resulting ALLOC_* bit mask.
 */
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
    int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
    const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));

    /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
    BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);

    /*
     * The caller may dip into page reserves a bit more if the caller
     * cannot run direct reclaim, or if the caller has realtime scheduling
     * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
     * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
     */
    alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);

    if (atomic) {
        /*
         * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
         * if it can't schedule.
         */
        if (!(gfp_mask & __GFP_NOMEMALLOC))
            alloc_flags |= ALLOC_HARDER;
        /*
         * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
         * comment for __cpuset_node_allowed_softwall().
         */
        alloc_flags &= ~ALLOC_CPUSET;
    } else if (unlikely(rt_task(current)) && !in_interrupt()) {
        alloc_flags |= ALLOC_HARDER;
    }

    if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
        if (gfp_mask & __GFP_MEMALLOC)
            alloc_flags |= ALLOC_NO_WATERMARKS;
        else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
            alloc_flags |= ALLOC_NO_WATERMARKS;
        else if (!in_interrupt() &&
                ((current->flags & PF_MEMALLOC) ||
                 unlikely(test_thread_flag(TIF_MEMDIE))))
            alloc_flags |= ALLOC_NO_WATERMARKS;
    }
#ifdef CONFIG_CMA
    /*
     * The condition must sit on its own line: anything after the macro
     * name on the #ifdef line is discarded by the preprocessor, which
     * would make the ALLOC_CMA assignment unconditional.
     */
    if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
    return alloc_flags;
}

/*
 * Allocate from the free lists with ALLOC_NO_WATERMARKS.
 *
 * Without __GFP_NOFAIL a single attempt is made and its result (possibly
 * NULL) is returned. With __GFP_NOFAIL the attempt is repeated, calling
 * wait_iff_congested() between tries, until a page is obtained.
 */
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
    struct zonelist *zonelist, enum zone_type high_zoneidx,
    nodemask_t *nodemask, struct zone *preferred_zone,
    int migratetype)
{
    struct page *page;

    for (;;) {
        /* One allocation attempt that ignores the watermarks. */
        page = get_page_from_freelist(gfp_mask, nodemask, order,
            zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
            preferred_zone, migratetype);

        /* Done on success, or when the caller tolerates failure. */
        if (page || !(gfp_mask & __GFP_NOFAIL))
            break;

        /* __GFP_NOFAIL: wait (bounded by HZ/50) and then try again. */
        wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
    }

    return page;
}

#ifdef CONFIG_COMPACTION
/*
 * Try memory compaction for a high-order allocation before reclaim, then
 * retry the free lists.
 *
 * @contended_compaction is passed to try_to_compact_pages();
 * @deferred_compaction is set to true when compaction is currently
 * deferred for @preferred_zone; @did_some_progress receives the value
 * returned by try_to_compact_pages().
 *
 * Returns the allocated page, or NULL when @order is 0, compaction is
 * deferred or was skipped, or the allocation after compaction failed.
 */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
    struct zonelist *zonelist, enum zone_type high_zoneidx,
    nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
    int migratetype, bool sync_migration,
    bool *contended_compaction, bool *deferred_compaction,
    unsigned long *did_some_progress)
{
    struct page *page;

    /* Compaction is only attempted for order > 0. */
    if (order == 0)
        return NULL;

    if (compaction_deferred(preferred_zone, order)) {
        *deferred_compaction = true;
        return NULL;
    }

    /* PF_MEMALLOC is set only for the duration of the compaction run. */
    current->flags |= PF_MEMALLOC;
    *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                        nodemask, sync_migration,
                        contended_compaction);
    current->flags &= ~PF_MEMALLOC;

    /* Compaction did not run at all: nothing further to try here. */
    if (*did_some_progress == COMPACT_SKIPPED)
        return NULL;

    /* Page migration frees to the PCP lists but we want merging */
    drain_pages(get_cpu());
    put_cpu();

    page = get_page_from_freelist(gfp_mask, nodemask,
            order, zonelist, high_zoneidx,
            alloc_flags & ~ALLOC_NO_WATERMARKS,
            preferred_zone, migratetype);
    if (!page) {
        /*
         * It's bad if compaction run occurs and fails.
         * The most likely reason is that pages exist,
         * but not enough to satisfy watermarks.
         */
        count_vm_event(COMPACTFAIL);

        /*
         * As async compaction considers a subset of pageblocks, only
         * defer if the failure was a sync compaction failure.
         */
        if (sync_migration)
            defer_compaction(preferred_zone, order);

        cond_resched();
        return NULL;
    }

    /* Success: reset the zone's compaction deferral bookkeeping. */
    preferred_zone->compact_blockskip_flush = false;
    preferred_zone->compact_considered = 0;
    preferred_zone->compact_defer_shift = 0;
    if (order >= preferred_zone->compact_order_failed)
        preferred_zone->compact_order_failed = order + 1;
    count_vm_event(COMPACTSUCCESS);

    return page;
}

    /*
     * NOTE(review): excerpt only. The signature, local declarations
     * (alloc_flags, cpuset_mems_cookie, memcg, ...) and the `retry_cpuset`
     * label of the enclosing function are not visible here; presumably
     * this is the tail of the allocator entry point. Confirm against the
     * full source.
     */
    /* The preferred zone is used for statistics later */
    first_zones_zonelist(zonelist, high_zoneidx,
                nodemask ? : &cpuset_current_mems_allowed,
                &preferred_zone);
    if (!preferred_zone)
        goto out;

#ifdef CONFIG_CMA
    if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
    /* First allocation attempt */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, alloc_flags,
            preferred_zone, migratetype);
    if (unlikely(!page)) {
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        gfp_mask = memalloc_noio_flags(gfp_mask);
        /* Fast path failed: fall back to the slow path with the adjusted mask. */
        page = __alloc_pages_slowpath(gfp_mask, order,
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
    }

    trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    memcg_kmem_commit_charge(page, memcg, order);

    return page;
}