Chapter 8 described kernel memory allocation in Linux 2.6.11. This chapter supplements it with a description of memory allocation in Linux 5.10.110, covering three areas. First is the buddy system, which, as we have seen, is the foundation of memory allocation in the kernel: both the slab allocator and vmalloc ultimately obtain their memory from it. Then we describe kmem_cache and kmalloc, the most commonly used means of allocating memory in the kernel. Finally, we describe vmalloc, which, compared with kmalloc, is mainly used to allocate large blocks of memory.
1: Data structures
This section describes the structures used by the memory-management subsystem.
1.1: Nodes (node) and zones (zone)
Modern computer memory is organized into nodes, and each node in turn contains several memory zones. Each node is described by a struct pglist_data. On a NUMA system there are multiple nodes, and their struct pglist_data instances form an array:
|
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
In the UMA model, by contrast, there is only one node, so the kernel uses the single instance
|
struct pglist_data __refdata contig_page_data; |
to describe it. In either case, struct pglist_data describes one memory node.
|
/* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. *///这个节点包含的所有管理区 struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ /* 在UMA中,node_zonelists包含了一个个struct zonelist。而struct zonelist是系统中所有的zone构成的数组 struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; ZONE_PADDING(_pad2_) /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; } pg_data_t; |
A node contains multiple memory zones, each described by a struct zone:
|
struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NEED_MULTIPLE_NODES int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/end(). Any reader who can't tolerant drift of * present_pages should get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; const char *name; int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ ZONE_PADDING(_pad2_) /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; bool contiguous; ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; |
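As a quick illustration of how a node and its zones fit together, here is a minimal, hypothetical kernel snippet (not taken from the kernel source; dump_node_zones is a made-up name) that walks the node_zones array of one pg_data_t and prints each populated zone:
|
#include <linux/mmzone.h>
#include <linux/printk.h>

/* Illustrative sketch: walk the zones of one node.
 * NODE_DATA(nid) returns the pg_data_t of node nid (on UMA this is
 * &contig_page_data); populated_zone() skips zones with no present pages. */
static void dump_node_zones(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *z = &pgdat->node_zones[i];

		if (!populated_zone(z))
			continue;
		pr_info("node %d zone %-8s spanned %lu present %lu managed %lu\n",
			nid, z->name, z->spanned_pages, z->present_pages,
			zone_managed_pages(z));
	}
} |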
1.2: struct alloc_context
The structure struct alloc_context holds the parameters of a single physical-memory allocation from the buddy system.
|
/* * in mm/page_alloc.c */ /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; |
2: The buddy system
The external interface to the buddy system is alloc_page or alloc_pages. Given the number of pages to allocate (1 << order) and a set of allocation flags, it allocates that many physically contiguous pages and returns the struct page of the first one.
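Before diving into the implementation, here is a minimal usage sketch of this interface (hypothetical module code; buddy_example is a made-up name, and the pages are assumed to come from a directly mapped zone so that page_address() is valid):
|
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Usage sketch: allocate 1 << 2 = 4 contiguous pages from the buddy system,
 * use them through their directly mapped kernel virtual address, then free
 * them again. */
static int buddy_example(void)
{
	unsigned int order = 2;
	struct page *page = alloc_pages(GFP_KERNEL, order);
	void *vaddr;

	if (!page)
		return -ENOMEM;

	vaddr = page_address(page);		/* kernel virtual address of the first page */
	memset(vaddr, 0, PAGE_SIZE << order);

	__free_pages(page, order);		/* give the pages back to the buddy system */
	return 0;
} |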
|
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) |
|
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); //在UMA系统中,numa_node_id()返回0 } |
|
/* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node(nid, gfp_mask, order); } |
|
/* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); return __alloc_pages(gfp_mask, order, nid); } |
|
static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } |
The functions above are simple wrappers with nothing noteworthy in them. We finally arrive at __alloc_pages_nodemask, the core of the buddy system.
2.1: The core of the buddy system: __alloc_pages_nodemask
__alloc_pages_nodemask is implemented as follows:
|
/* * This is the 'heart' of the zoned buddy allocator. */ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; /* * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. */ if (unlikely(order >= MAX_ORDER)) { WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; } gfp_mask &= gfp_allowed_mask; alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) //初始化struct alloc_context结构体 return NULL; /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); /* First allocation attempt *///第一次分配,快速分配 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) goto out; /* * Apply scoped allocation constraints. This is mainly about GFP_NOFS * resp. GFP_NOIO which has to be inherited for all allocation requests * from a particular context which has been marked by * memalloc_no{fs,io}_{save,restore}. */ alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* * Restore the original nodemask if it was potentially replaced with * &cpuset_current_mems_allowed to optimize the fast-path attempt. */ ac.nodemask = nodemask; page = __alloc_pages_slowpath(alloc_mask, order, &ac); out: return page; } |
This function consists of three main parts:
1: prepare_alloc_pages
2: get_page_from_freelist
3: __alloc_pages_slowpath
2.2: prepare_alloc_pages
This function mainly fills in the struct alloc_context. The structure (annotated) and the implementation are shown below:
|
/* * in mm/page_alloc.c */ /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. *///从注释中,我们知道,成员zonelist, preferred_zone and highest_zoneidx在快速分配和慢速分配中,会发生改变 struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. *///这个枚举变量的取值可能是ZONE_DMA,ZONE_DMA32,ZONE_NORMAL,ZONE_HIGHMEM,ZONE_MOVABLE。这个值限制了可以使用的zone的idx最大值 enum zone_type highest_zoneidx; bool spread_dirty_pages; }; |
|
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { ac->highest_zoneidx = gfp_zone(gfp_mask); //根据gfp_mask的最后4bit,我们可以获得从哪些内存区中分配内存。例如,GFP_HIGHUSER_MOVABLE包含了__GFP_HIGHMEM| __GFP_MOVABLE。在GFP_ZONE_TABLE中,对应了ZONE_MOVABLE。也就是说,分配内存的时候,内存区最高使用到MOVABLE ac->zonelist = node_zonelist(preferred_nid, gfp_mask); //每个节点node的struct zonelist node_zonelists[MAX_ZONELISTS]成员都存放了所有的内存区的zone数据,但是每个node的node_zonelists最开始都是自己的zone。并且每个节点node的struct zonelist node_zonelists[MAX_ZONELISTS]是从高级zone向低级排的,也就是说,这里zonelist可能是这个node的NORMAL,DMA32,DMA这个顺序 ac->nodemask = nodemask; ac->migratetype = gfp_migratetype(gfp_mask); //根据gfp_mask中是否设置__GFP_RECLAIMABLE与__GFP_MOVABLE,设置migratetype为enum migratetype中的值 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) return false; *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* * The preferred zone is used for statistics but crucially it is * also used as the starting point for the zonelist iterator. It * may get reset for allocations that ignore memory policies. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, //我们拿到最开始进行分配的zone。要求这个zoneref的zone_idx <= highest_zoneidx。例如,我们使用GFP_KERNEL宏,我们会按照NORMAL,DMA32,DMA的顺序分配,所以这里拿到的是NORMAL ac->highest_zoneidx, ac->nodemask); return true; } |
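To make the first two derivations concrete, the hypothetical snippet below (gfp_examples is a made-up name) prints what gfp_zone() and gfp_migratetype() return for a few common flag combinations; the comments assume a typical x86-64 configuration:
|
#include <linux/gfp.h>
#include <linux/printk.h>

/* Illustrative sketch of what prepare_alloc_pages() derives from gfp_mask. */
static void gfp_examples(void)
{
	/* no zone modifier: highest usable zone is ZONE_NORMAL */
	pr_info("GFP_KERNEL            -> highest zone %d\n", gfp_zone(GFP_KERNEL));
	/* __GFP_DMA restricts the allocation to ZONE_DMA */
	pr_info("GFP_KERNEL|__GFP_DMA  -> highest zone %d\n",
		gfp_zone(GFP_KERNEL | __GFP_DMA));
	/* __GFP_HIGHMEM | __GFP_MOVABLE allows up to ZONE_MOVABLE */
	pr_info("GFP_HIGHUSER_MOVABLE  -> highest zone %d\n",
		gfp_zone(GFP_HIGHUSER_MOVABLE));
	/* __GFP_MOVABLE also selects the MIGRATE_MOVABLE free lists */
	pr_info("GFP_HIGHUSER_MOVABLE  -> migratetype %d\n",
		gfp_migratetype(GFP_HIGHUSER_MOVABLE));
} |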
2.3: get_page_from_freelist
get_page_from_freelist performs the first allocation attempt. Its parameters are: gfp_mask, the gfp_mask passed in by the caller; order, the requested order; alloc_flags, which encodes the watermark to use, whether reclaim is allowed, and similar information; and ac, which describes the zones available for the allocation. get_page_from_freelist is implemented as follows:
|
static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; struct pglist_data *last_pgdat_dirty_limit = NULL; bool no_fallback; retry: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, //这里遍历所有能够做内存分配的zone ac->nodemask) { struct page *page; unsigned long mark; ……//这部分内容我们不关注 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);//我们获得这次分配使用的水位线。在快速分配的时候,我们使用的是LOW水位线。这时候,mark是页数 if (!zone_watermark_fast(zone, order, mark, //这里判断zone能够满足分配条件。满足的话返回true,从而进入rmqueue进行内存分配。判断的标准就是,分配完成后,剩下的内存应该大于LOW水线的一个相关值 ac->highest_zoneidx, alloc_flags, gfp_mask)) { int ret; /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; continue; //我们假设我们是UMA模型。这种情况下一定会走到continue …… //在NUMA模型中,后面的流程无效,我们不关注 } try_this_zone://流程走到这里,表示我们已经找到了一个满足水位要求的zone,或者我们设置了ALLOC_NO_WATERMARKS,也就是不考虑水位 page = rmqueue(ac->preferred_zoneref->zone, zone, order, //使用rmqueue从伙伴系统中分配内存 gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); //设置page->_refcount=1,清->private=0 /* * If this is a high-order atomic allocation then check * if the pageblock should be reserved for the future */ if (unlikely(order && (alloc_flags & ALLOC_HARDER))) reserve_highatomic_pageblock(page, zone, order); return page; } //我们选择了一个内存区,但是这个内存区也可能分不出来内存。分不出来就继续做大循环 } //我们遍历了所有的zone,还是没能找到合适的内存区,返回NULL /* * It's possible on a UMA machine to get through all zones that are * fragmented. If avoiding fragmentation, reset and try again. */ if (no_fallback) { alloc_flags &= ~ALLOC_NOFRAGMENT; goto retry; } return NULL; } |
So on the first, fast-path attempt we scan the zones and check whether each one can satisfy the request; if one can, we allocate from it via rmqueue, otherwise we return NULL. Whether a zone qualifies is closely tied to the LOW watermark: when the watermark is high, the fast path may well fail to produce the pages.
Note that no memory reclaim is performed on the fast path.
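The essence of the watermark test can be sketched as a simplified, standalone model of __zone_watermark_ok() (not the kernel's code; it ignores the ALLOC_HIGH/ALLOC_HARDER adjustments and the per-order free_area scan):
|
#include <stdbool.h>

/* Simplified model: after taking 1 << order pages, the zone must still hold
 * more free pages than the chosen watermark plus the lowmem reserve kept for
 * allocations that could have used a lower zone. */
static bool watermark_ok(unsigned long free_pages, unsigned int order,
			 unsigned long mark, unsigned long lowmem_reserve)
{
	long remaining = (long)free_pages - (1L << order);

	if (remaining <= (long)(mark + lowmem_reserve))
		return false;
	/* for order > 0 the real code additionally checks that free_area[o]
	 * still contains a block of at least the requested order */
	return true;
} |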
2.4: __alloc_pages_slowpath
If the first, fast-path attempt does not succeed, the allocation enters the slow path. The parameters are: gfp_mask, the gfp_mask passed in by the caller; order, the requested order; and ac, which describes the zones available for the allocation.
|
static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; //order > 3?当order比较小的时候,内存分配不受内存碎片的干扰,因此无法分配内存的时候,不应该尝试内存压缩(迁移) struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; enum compact_priority compact_priority; enum compact_result compact_result; int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; int reserve_flags; /* * We also sanity check to catch abuse of atomic reserves being used by * callers that are not in atomic context. *///__GFP_ATOMIC表示分配内存是原子操作。在此过程中不能进行内存回收 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; retry_cpuset: compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; /* * The fast path uses conservative alloc_flags to succeed only until * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. *///相较于快速路径,慢速路径中对alloc_flags做了以下修改:1:水位值使用MIN;2:如果gfp_mask设置__GFP_HIGH(高优先级)或__GFP_KSWAPD_RECLAIM(允许唤醒kswapd),在alloc_flags中设置相应的位;3:如果gfp_mask设置__GFP_ATOMIC或者当前进程是实时进程,且不在中断中,设置ALLOC_HARDER alloc_flags = gfp_to_alloc_flags(gfp_mask); /* * We need to recalculate the starting point for the zonelist iterator * because we might have used different nodemask in the fast path, or * there was a cpuset modification and we are retrying - otherwise we * could end up iterating over non-eligible zones endlessly. *///我们重新设置preferred_zoneref的值 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; if (alloc_flags & ALLOC_KSWAPD)//如果gfp_mask设置了__GFP_KSWAPD_RECLAIM,这里就会唤醒kswapd回收内存。注意有__GFP_KSWAPD_RECLAIM(通过kswapd)或者__GFP_RECLAIM(直接回收两种形式) wake_all_kswapds(order, gfp_mask, ac); /* * The adjusted alloc_flags might result in immediate success, so try * that first *///我们使用了MIN水位,并且唤醒了kswapd。这里直接分配一下,看能不能分出来 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg; /* * For costly allocations, try direct compaction first, as it's likely * that we have enough base pages and don't need to reclaim. For non- * movable high-order allocations, do that as well, as compaction will * try prevent permanent fragmentation by migrating from blocks of the * same migratetype. * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. *///下面都是和内存压缩有关的代码。我们暂时不关注内存压缩 if (can_direct_reclaim && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, &compact_result); if (page) goto got_pg; /* * Checks for costly allocations with __GFP_NORETRY, which * includes some THP page fault allocations */ if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If allocating entire pageblock(s) and compaction * failed because all zones are below low watermarks * or is prohibited because it recently failed at this * order, fail immediately unless the allocator has * requested compaction and reclaim retry. 
* * Reclaim is * - potentially very expensive because zones are far * below their low watermarks or this is part of very * bursty high order allocations, * - not guaranteed to help because isolate_freepages() * may not iterate over freed pages as part of its * linear scan, and * - unlikely to make entire pageblocks free on its * own. */ if (compact_result == COMPACT_SKIPPED || compact_result == COMPACT_DEFERRED) goto nopage; /* * Looks like reclaim/compaction is worth trying, but * sync compaction could be very expensive, so keep * using async compaction. */ compact_priority = INIT_COMPACT_PRIORITY; } } retry://我们已经使用了MIN水位。但是还是分不出来内存 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop *///这里再唤醒一下kswapd进程 if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);//如果当前进程是要释放内存的进程,那么会设置ALLOC_NO_WATERMARKS,不再使用水位线 if (reserve_flags) alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); /* * Reset the nodemask and zonelist iterators if memory policies can be * ignored. These allocations are high priority and system rather than * user oriented. */ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags *///再次更改水位线,或者使用kswapd释放内存后,再次分配内存 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg; /* Caller is not willing to reclaim, we can't balance anything *///这个进程不愿意直接回收内存 if (!can_direct_reclaim) goto nopage; /* Avoid recursion of direct reclaim *///这个进程就是回收内存的进程。我们避免他一直重复在回收内存的过程中 if (current->flags & PF_MEMALLOC) goto nopage; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,//我们直接回收内存后,看能否分配出内存 &did_some_progress); if (page) goto got_pg; /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,compact_priority, &compact_result); if (page) goto got_pg; /* Do not loop if specifically requested *///经过内存回收以及压缩后,我们还是无法成功分配内存。如果gfp_mask & __GFP_NORETRY,就不会做oom kill等操作 if (gfp_mask & __GFP_NORETRY) goto nopage; /* * Do not retry costly high order allocations unless they are * __GFP_RETRY_MAYFAIL */// if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; //重新做reclaim操作。这里面会判断no_progress_loops是否大于MAX_RECLAIM_RETRIES(16)?如果是的话,已经做了16次没有进展的回收和压缩操作了,进OOM吧 /* * It doesn't make any sense to retry for the compaction if the order-0 * reclaim is not able to make any progress because the current * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, //重新做compact操作 compact_result, &compact_priority, &compaction_retries)) goto retry; /* Reclaim has failed us, start killing things *///oom_killer机制 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ if (tsk_is_oom_victim(current) && (alloc_flags & ALLOC_OOM || (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; /* Retry as long as the OOM killer is making progress *///oom已经杀死了一些进程。我们这时候再次尝试内存分配 if (did_some_progress) { no_progress_loops = 0; goto retry; } nopage: /* * Make sure that 
__GFP_NOFAIL request doesn't leak out and make sure * we always retry */ if (gfp_mask & __GFP_NOFAIL) { //设置了__GFP_NOFAIL,我们最终还会继续retry,始终不会返回NULL /* * All existing users of the __GFP_NOFAIL are blockable, so warn * of any new users that actually require GFP_NOWAIT */ if (WARN_ON_ONCE(!can_direct_reclaim)) goto fail; /* * PF_MEMALLOC request from this context is rather bizarre * because we cannot reclaim anything and only can loop waiting * for somebody to do a work for us */ WARN_ON_ONCE(current->flags & PF_MEMALLOC); /* * non failing costly orders are a hard requirement which we * are not prepared for much so let's warn about these users * so that we can identify them and convert them to something * else. */ WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); /* * Help non-failing allocations by giving them access to memory * reserves but do not use ALLOC_NO_WATERMARKS because this * could deplete whole memory reserves which would just make * the situation worse */ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); if (page) goto got_pg; cond_resched(); goto retry; } fail: warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; } |
So in the slow path, memory is reclaimed either by waking kswapd or by direct reclaim (the latter only if __GFP_DIRECT_RECLAIM is set), and the allocator may additionally fall back to memory compaction and, as a last resort, the OOM killer.
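The hypothetical snippet below (slowpath_flag_examples is a made-up name) summarizes how a few common gfp flags steer this slow-path behaviour, based on the checks in __alloc_pages_slowpath() above:
|
#include <linux/gfp.h>
#include <linux/mm.h>

static void slowpath_flag_examples(void)
{
	struct page *p;

	/* GFP_ATOMIC: no __GFP_DIRECT_RECLAIM, so the slow path can only dip
	 * into the MIN watermark / reserves and may still return NULL. */
	p = alloc_pages(GFP_ATOMIC, 0);
	if (p)
		__free_pages(p, 0);

	/* __GFP_NORETRY: give up after one round of reclaim/compaction and
	 * never invoke the OOM killer. */
	p = alloc_pages(GFP_KERNEL | __GFP_NORETRY, 2);
	if (p)
		__free_pages(p, 2);

	/* __GFP_NOFAIL: the slow path keeps retrying, so for small orders the
	 * call does not return NULL. */
	p = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 0);
	__free_pages(p, 0);
} |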
3: kmalloc
kmalloc needs no introduction, so let us look directly at its implementation.
|
/** * kmalloc - allocate memory * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. * * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN * bytes. For @size of power of two bytes, the alignment is also guaranteed * to be at least to the size. * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp.h and described at * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>` * * The recommended usage of the @flags is described at * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>` * * Below is a brief outline of the most useful GFP flags * * %GFP_KERNEL * Allocate normal kernel ram. May sleep. * * %GFP_NOWAIT * Allocation will not sleep. * * %GFP_ATOMIC * Allocation will not sleep. May use emergency pools. * * %GFP_HIGHUSER * Allocate memory from high memory on behalf of user. * * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * * %__GFP_HIGH * This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL * Indicate that this allocation is in no way allowed to fail * (think twice before using). * * %__GFP_NORETRY * If memory is not immediately available, * then give up at once. * * %__GFP_NOWARN * If allocation fails, don't issue any warnings. * * %__GFP_RETRY_MAYFAIL * Try really hard to succeed the allocation but fail * eventually. */ static __always_inline void *kmalloc(size_t size, gfp_t flags) { return __kmalloc(size, flags); } |
|
void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; void *ret; if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) //KMALLOC_MAX_CACHE_SIZE = 1 << 13 (two pages with 4 KiB pages) return kmalloc_large(size, flags); s = kmalloc_slab(size, flags); //pick a suitable kmalloc slab cache based on size if (unlikely(ZERO_OR_NULL_PTR(s))) return s; ret = slab_alloc(s, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); ret = kasan_kmalloc(s, ret, size, flags); return ret; } |
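Here is a minimal usage sketch of kmalloc (hypothetical code; struct foo and the helpers are made up, and on a typical configuration an object of this size would be served from the kmalloc-64 cache):
|
#include <linux/slab.h>

struct foo {
	int id;
	char name[32];
};

static struct foo *make_foo(int id)
{
	/* kzalloc = kmalloc + zeroing; served from a generic kmalloc-<size> cache */
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;
	f->id = id;
	return f;
}

static void destroy_foo(struct foo *f)
{
	kfree(f);	/* kfree(NULL) is a no-op */
} |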
The relationships between the SLUB-related data structures in the system are shown in the figure below:

The figure conveys a lot of information:
1: kmem_cache is the management structure of a slub cache.
2: SLUB acts as a cache in front of the buddy system and is itself layered: kmem_cache_cpu.page -> kmem_cache_cpu.partial -> kmem_cache.node.partial. Roughly, an allocation is first served from kmem_cache_cpu.page; if that fails, a page from this CPU's kmem_cache_cpu.partial list becomes the new kmem_cache_cpu.page and the allocation is retried; if that also fails, the node's kmem_cache.node.partial list is searched, the first page with free objects becomes kmem_cache_cpu.page, and some further pages are moved onto kmem_cache_cpu.partial; only if all of this fails are new pages allocated from the buddy system.
3: A page acting as kmem_cache_cpu.page has these properties: page->inuse == page->objects, page->frozen == 1, page->freelist == NULL.
4: A page on kmem_cache_cpu.partial has these properties: page->inuse is the number of objects actually in use, page->frozen == 1, and page->freelist points to its first free object.
3.1: Creating a slub cache
A slub cache is created with kmem_cache_create. This function only creates and initializes the struct kmem_cache describing the cache; no memory has been allocated for the cache from the buddy system yet. Let us first look at struct kmem_cache:
|
/* * Slab cache management. */ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; unsigned int size; /* The size of an object including metadata */ unsigned int object_size;/* The size of an object without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; #endif struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ struct kmem_cache_order_objects max; struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *); unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ struct kmem_cache_node *node[MAX_NUMNODES]; }; |
kmem_cache_create is implemented as follows:
|
/** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache.//slabinfo中显示的名字 * @size: The size of objects to be created in this cache.//slab中每个对象的大小 * @align: The required alignment for the objects.//slab中每个对象的对其要求 * @flags: SLAB flags//下面有可能的取值以及它们的含义,一般无需关注 * @ctor: A constructor for the objects.//当一个页被分配给slub时,执行的初始化函数 * * Cannot be called within a interrupt, but can be interrupted. * The @ctor is run when new pages are allocated by the cache. * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) * to catch references to uninitialised memory. * * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check * for buffer overruns. * * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. * * Return: a pointer to the cache on success, NULL on failure. */ struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, ctor); } |
|
struct kmem_cache * kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache *s = NULL; const char *cache_name; int err; get_online_cpus(); get_online_mems(); mutex_lock(&slab_mutex); /* Refuse requests with allocator specific flags */ if (flags & ~SLAB_FLAGS_PERMITTED) { err = -EINVAL; goto out_unlock; } /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this * case, and we'll just provide them with a sanitized version of the * passed flags. */ flags &= CACHE_CREATE_MASK; if (!usersize) s = __kmem_cache_alias(name, size, align, flags, ctor); //在此函数中,通过调用find_mergeable,寻找已有的slub中,是否有一个可以和当前要创建的slub合并 if (s) goto out_unlock; cache_name = kstrdup_const(name, GFP_KERNEL); //如果name在用户态地址空间,那么将name拷贝到内核态地址空间 if (!cache_name) { err = -ENOMEM; goto out_unlock; } //如果没有找到能够merge的slub,我们就新创建一个slub s = create_cache(cache_name, size, calculate_alignment(flags, align, size), flags, useroffset, usersize, ctor, NULL); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); } out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); return s; } |
|
static struct kmem_cache *create_cache(const char *name, unsigned int object_size, unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *), struct kmem_cache *root_cache) { struct kmem_cache *s; int err; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); //从slub中分配一个kmem_cache if (!s) goto out; s->name = name; s->size = s->object_size = object_size; //object_size就是slub中,每个结构体的大小 s->align = align; s->ctor = ctor; s->useroffset = useroffset; s->usersize = usersize; //上面这些参数的值可以通过/sys/kernel/slab/下的文件查看 err = __kmem_cache_create(s, flags); if (err) goto out_free_cache; s->refcount = 1; list_add(&s->list, &slab_caches); //slab_cached是所有slub组成的链表 out: if (err) return ERR_PTR(err); return s; } |
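Putting the creation API together, here is a hypothetical usage sketch: create a dedicated cache for one object type, allocate and free an object from it, and destroy the cache again (all names are made up):
|
#include <linux/slab.h>
#include <linux/errno.h>

struct my_obj {
	unsigned long seq;
	char payload[120];
};

static struct kmem_cache *my_cache;

static int my_cache_demo(void)
{
	struct my_obj *obj;

	my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
				     0 /* default alignment */,
				     SLAB_HWCACHE_ALIGN, NULL /* no constructor */);
	if (!my_cache)
		return -ENOMEM;

	obj = kmem_cache_alloc(my_cache, GFP_KERNEL);
	if (obj)
		kmem_cache_free(my_cache, obj);

	kmem_cache_destroy(my_cache);
	return 0;
} |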
3.1.1: __kmem_cache_create
__kmem_cache_create has different implementations for slab, slub and slob. SLUB is the common choice nowadays, so we look at the implementation in slub.c.
|
int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { int err; err = kmem_cache_open(s, flags); if (err) return err; /* Mutex is not taken during early boot */ if (slab_state <= UP) return 0; err = sysfs_slab_add(s); //这个slub已经初始化完成。在sys/kernel/slab中添加对应的文件。之后,我们就可以在这个文件中看和这个slub相关的信息 if (err) __kmem_cache_release(s); return err; } |
This function calls kmem_cache_open, which is implemented as follows:
|
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name); if (!calculate_sizes(s, -1)) //设置slub->order,返回一个slub需要多少个page,对应多少个对象 goto error; /* * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ set_min_partial(s, ilog2(s->size) / 2); //这个值可以在/sys/kernel/slab/<slab_name>/ min_partial中查看。设置了kmem_cache.min_partial。当kmem_cache.kmem_cache_node中的page数目大于min_partial,就会释放slub(也就是一个slub包含的order页) set_cpu_partial(s); //设置struct kmem_cache -> unsigned int cpu_partial。在第3节的图中我们看到,如果kmem_cache_cpu中,partical链表链接的page中的空闲slub对象数目大于cpu_partial,就会将这些page移动到kmem_cache.kmem_cache_node的partial链表中 if (!init_kmem_cache_nodes(s)) //创建并初始化kmem_cache.kmem_cache_node结构体 goto error; if (alloc_kmem_cache_cpus(s)) /* struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; ……} struct kmem_cache_cpu是一个每cpu变量。也就是每个cpu都有一个。这个结构体的内容如下 struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ }; 这里分配这个结构体的内存。在后面适当的时候,会给这个结构体赋值 */ return 0; error: __kmem_cache_release(s); return -EINVAL; } |
3.1.2: calculate_sizes
calculate_sizes determines the order of a slab and the layout of data within it. The order determines how much memory each slab occupies (1 << order pages), so a single slab may span several contiguous physical pages.
|
/* * calculate_sizes() determines the order and the distribution of data within * a slab object. */ static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; unsigned int order; /* * Round up object size to the next word boundary. We can only * place the free pointer at word boundaries and this determines * the possible location of the free pointer. */ size = ALIGN(size, sizeof(void *)); /* * With that we have determined the number of bytes in actual use * by the object and redzoning. */ s->inuse = size; if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || s->ctor) { …… } else { /* * Store freelist pointer near middle of object to keep * it away from the edges of the object to avoid small * sized over/underflows from neighboring allocations. */ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); //slub->offset是偏移量。一个slub中包含了多个对象。我们可以用未分配的slub对象存放下一个未分配的slub对象的首地址,类似形成一个链表。offset就是相对于空闲object的首地址,存放下一个空闲object的首地址的地址的偏移 } /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. */ size = ALIGN(size, s->align); s->size = size; s->reciprocal_size = reciprocal_value(size); if (forced_order >= 0) order = forced_order; else order = calculate_order(size); //根据slub的每个对象的大小size,计算得到slab使用多少个页。当我们调用函数new_slab_objects给一个struct kmem_cache_cpu分配新的page的时候,就会从伙伴系统中分配1<<order个page if ((int)order < 0) return 0; s->allocflags = 0; if (order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; if (s->flags & SLAB_CACHE_DMA32) s->allocflags |= GFP_DMA32; if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; /* * Determine the number of objects per slab */ s->oo = oo_make(order, size); //oo记录了slub的order,还记录了一个slub总可以容纳多少个对象 s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; return !!oo_objects(s->oo); } |
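As a worked example of oo (assuming 4 KiB pages): if the rounded-up object size is 256 bytes and calculate_order() settles on order 0, one slab holds 4096 / 256 = 16 objects; if the size is 704 bytes and order 1 is chosen, one slab spans two pages and holds 8192 / 704 = 11 objects. The tiny helper below (a sketch, not a kernel function) shows the arithmetic:
|
/* objects per slab for a given order and (aligned) object size,
 * assuming PAGE_SIZE is defined as in kernel context */
static unsigned int objs_per_slab(unsigned int order, unsigned int size)
{
	return (unsigned int)((PAGE_SIZE << order) / size);
} |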
3.2: Allocating memory for a slub cache
The steps above created a slub cache, but it does not have any memory yet. The first time the cache is used, pages are allocated to it, and objects are then handed out from those pages.
The call chain for this is: kmalloc -> __kmalloc -> slab_alloc -> slab_alloc_node -> __slab_alloc -> ___slab_alloc -> new_slab_objects. In other words, when a slub cache needs fresh pages from the buddy system, new_slab_objects is called.
|
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu **pc) { void *freelist; struct kmem_cache_cpu *c = *pc; struct page *page; WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); freelist = get_partial(s, flags, node, c); if (freelist) return freelist; //上面的内容会在3.3:kmalloc分配内存中讨论。上面的流程没有走到伙伴系统中做分配 page = new_slab(s, flags, node); //从伙伴系统中分配page给slub if (page) { c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); /* * No other reference to the page yet so we can * muck around with it freely without cmpxchg */ freelist = page->freelist; //freelist指向了page对应的物理页的虚拟地址首地址,也是存放第一个空闲的slub对象的首地址 page->freelist = NULL; stat(s, ALLOC_SLAB); c->page = page; //page和freelist是对应的。freelist就是page描述的页面的虚拟地址首地址 *pc = c; } return freelist; } |
|
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { return allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } |
|
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p, *next; int idx; bool shuffle; flags &= gfp_allowed_mask; if (gfpflags_allow_blocking(flags)) local_irq_enable(); flags |= s->allocflags; /* * Let the initial higher-order allocation fail under memory pressure * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); //从伙伴系统中分配出oo_order(oo)个页 page->objects = oo_objects(oo); //一个slub可能包含多个page,这个slub包含的总对象数目放入objects中 page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); start = page_address(page); //通过page,获得它对应的虚拟地址。这种地址都是直接映射的地址(物理地址和虚拟地址之间是线性关系)。这种虚拟地址从0xffff888000000000开始 if (1) { start = fixup_red_left(s, start); start = setup_object(s, page, start); page->freelist = start; //因此,page->freeelist就是page对应的虚拟地址首地址 for (idx = 0, p = start; idx < page->objects - 1; idx++) { next = p + s->size; next = setup_object(s, page, next); set_freepointer(s, p, next); //将*(p+s->offset) = next。next是下一个空闲的对象块。将他的地址存放在上一个空闲对象块+s->offset处 p = next; } set_freepointer(s, p, NULL); //最后一个slub对象的p+s->offset = NULL } page->inuse = page->objects; page->frozen = 1; out: if (gfpflags_allow_blocking(flags)) local_irq_disable(); if (!page) return NULL; inc_slabs_node(s, page_to_nid(page), page->objects); return page; } |
With that, we know how SLUB manages the allocation of free objects:
1: kmem_cache -> kmem_cache_cpu -> freelist holds the address of the first free object in the slab currently in use.
2: The word stored at freelist + kmem_cache->offset holds the address of the next free object, so the free objects form a linked list threaded through the free objects themselves.
3: On every allocation, the object at freelist is handed out, and freelist is then advanced to the next free object, i.e. to the pointer stored at freelist + offset.
4: On every free, freelist is pointed at the released object, and the previous freelist value is stored at that object + offset (see the sketch below).
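This pointer-in-free-object scheme can be modelled with a small standalone sketch (simplified: no per-CPU state, no cmpxchg; tiny_cache and its helpers are made up and merely mirror get_freepointer()/set_freepointer()):
|
#include <stddef.h>
#include <string.h>

struct tiny_cache {
	size_t off;		/* kmem_cache->offset */
	void *freelist;		/* first free object, like kmem_cache_cpu->freelist */
};

static void *get_fp(struct tiny_cache *c, void *obj)
{
	void *next;
	memcpy(&next, (char *)obj + c->off, sizeof(next));	/* read the stored link */
	return next;
}

static void set_fp(struct tiny_cache *c, void *obj, void *next)
{
	memcpy((char *)obj + c->off, &next, sizeof(next));	/* store the link */
}

static void *tiny_alloc(struct tiny_cache *c)
{
	void *obj = c->freelist;

	if (obj)
		c->freelist = get_fp(c, obj);	/* advance to the next free object */
	return obj;
}

static void tiny_free(struct tiny_cache *c, void *obj)
{
	set_fp(c, obj, c->freelist);	/* freed object links to the old head */
	c->freelist = obj;		/* and becomes the new head */
} |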
3.3: Allocating memory with kmalloc
This section describes how SLUB performs an allocation. As we already know, the allocation interfaces are kmalloc and kmem_cache_alloc, with the following call chains:
|
kmalloc -> __kmalloc -> slab_alloc kmem_cache_alloc -> slab_alloc |
slab_alloc is implemented as follows:
|
static __always_inline void *slab_alloc(struct kmem_cache *s, gfp_t gfpflags, unsigned long addr) { return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); } |
When a kmem_cache is created, each CPU gets its own slab state, organized in the per-CPU variable struct kmem_cache_cpu. The benefit is that this avoids the locking overhead of concurrent access from multiple CPUs. To recap, struct kmem_cache_cpu is defined as follows:
|
struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; …… }; |
|
struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ }; |
So we must allocate memory for the slub cache and set the members of struct kmem_cache_cpu according to the memory obtained.
3.3.1: slab_alloc_node
slab_alloc_node is implemented as follows:
|
//和伙伴系统中的分配类似,这里也有快速路径以及慢速路径两种分配方式 gfp_t gfpflags, int node, unsigned long addr) { void *object; struct kmem_cache_cpu *c; struct page *page; unsigned long tid; struct obj_cgroup *objcg = NULL; s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) return NULL; redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is * enabled. We may switch back and forth between cpus while * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * * We should guarantee that tid and kmem_cache are retrieved on * the same cpu. It could be different if CONFIG_PREEMPTION so we need * to check if it is matched or not. */ do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); //已经说过,struct kmem_cache_cpu结构体存储了当前cpu对应的slub对象链表 /* * Irqless object alloc/free algorithm used here depends on sequence * of fetching cpu_slab's data. tid should be fetched before anything * on c to guarantee that object and page associated with previous tid * won't be used with current tid. If we fetch tid first, object and * page could be one associated with next tid and our alloc/free * request will be failed. In this case, we will retry. So, no problem. */ barrier(); /* * The transaction ids are globally unique per cpu and per operation on * a per cpu queue. Thus they can be guarantee that the cmpxchg_double * occurs on the right processor and that there was no operation on the * linked list in between. */ object = c->freelist; //freelist是这个cpu的slub中,空闲对象的首地址 page = c->page; if (unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); //慢速分配流程 } else { //快速分配流程。这时候freelist就是空闲对象 void *next_object = get_freepointer_safe(s, object); //这个宏展开就是: (void *)*(unsigned long *)(object + s->offset)。我们已经说过,slub中用未分配的slub存放下一个未分配的slub对象的地址,offset就是偏移。因此,这里拿到了下一个空闲object的首地址 /* * The cmpxchg will only match if there was no additional * operation and if we are on the right processor. * * The cmpxchg does the following atomically (without lock * semantics!) * 1. Relocate first pointer to the current per cpu area. * 2. Verify that tid and freelist have not been changed * 3. If they were not changed replace tid and freelist * * Since this is without lock semantics the protection is only * against code executing on this cpu *not* from access by * other cpus. *///this_cpu_cmpxchg_double比较参数1,2和参数3,4是否相等。如果是的话,将参数5,6设置给参数1,2.其实就是将next_object设置给s->cpu_slab->freelist if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, next_object, next_tid(tid)))) { note_cmpxchg_failure("slab_alloc", s, tid); goto redo; } prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } maybe_wipe_obj_freeptr(s, object); if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(object, 0, s->object_size); slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); return object; } |
So on the fast path we simply hand out kmem_cache -> kmem_cache_cpu -> freelist as the object returned by this allocation.
3.3.2: ___slab_alloc
Besides the fast path above there is also a slow path, and the very first allocation from a slub cache also takes it, because at that point the struct kmem_cache_cpu has no free objects available.
|
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *p; unsigned long flags; local_irq_save(flags); #ifdef CONFIG_PREEMPTION /* * We may have been preempted and rescheduled on a different * cpu before disabling interrupts. Need to reload cpu area * pointer. *///重新获取struct kmem_cache_cpu c = this_cpu_ptr(s->cpu_slab); #endif p = ___slab_alloc(s, gfpflags, node, addr, c); local_irq_restore(flags); return p; } |
|
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *freelist; struct page *page; stat(s, ALLOC_SLOWPATH); page = c->page; //c->page可以看作是slub当前正在使用的page。freelist应该是和page配套的 if (!page) { //当slub刚刚创建,还没有对应的内存的时候,page为NULL /* * if the node is not online or has no normal memory, just * ignore the node constraint */ if (unlikely(node != NUMA_NO_NODE && !node_state(node, N_NORMAL_MEMORY))) node = NUMA_NO_NODE; goto new_slab; } redo: /* must check again c->freelist in case of cpu migration or IRQ *///如果这时候,freelist不为空,可能是在进入慢速路径的时候,这个上面又有了空闲的slub。从注释看,可能是中断或者进程迁移造成的 freelist = c->freelist; if (freelist) goto load_freelist; freelist = get_freelist(s, page); //这里的page是struct kmem_cache_cpu.page。因为new_slab中有redo,所以这时候的page可能是进入此函数时,struct kmem_cache_cpu就带有的page,或者struct kmem_cache_cpu.partical上的page。这个函数最重要的作用就是返回page->freelist,并将page->freelist=NULL if (!freelist) { //这种情况下,说明struct kmem_cache_cpu.page也不包含空闲对象 c->page = NULL; stat(s, DEACTIVATE_BYPASS); goto new_slab; //分配新的page } stat(s, ALLOC_REFILL); load_freelist: /* * freelist is pointing to the list of objects to be used. * page is pointing to the page from which the objects are obtained. * That page must be frozen for per cpu allocations to work. */ VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); //返回第一个空闲对象,将freelist设置为第二个空闲对象 c->tid = next_tid(c->tid); return freelist; new_slab: //给kmem_cache_cpu创建新的slub(分配kmem_cache.order个页) if (slub_percpu_partial(c)) { //如果kmem_cache_cpu.partical上有page的话,就从这些page上面分。partical是部分分配的页 page = c->page = slub_percpu_partial(c); //将c->page设置为c->partical slub_set_percpu_partial(c, page); //将c->partical设置为之前partical页的下一页 stat(s, CPU_PARTIAL_ALLOC); //将struct kmem_cache_cpu.page设置为struct kmem_cache_cpu.partical。然后将struct kmem_cache_cpu.partical设置为next page goto redo; } //我们走到这里,一定是遍历了kmem_cache_cpu.partical中所有的页,都没有找到空闲对象。这时候必须分配新的slub freelist = new_slab_objects(s, gfpflags, node, &c); //这里就会通过伙伴系统给slub分配新的page,返回的freelist就是slub对象的指针 if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); return NULL; } page = c->page; if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) goto load_freelist; //在分配新的page之后,这里会跳回到load_freelist处执行。然后,就会将c->freelist = get_freepointer(s, freelist),也就是第二个空闲slub对象的地址 /* Only entered in the debug case */ if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ deactivate_slab(s, page, get_freepointer(s, freelist), c); return freelist; } |
Besides the per-CPU partial list attached to each struct kmem_cache_cpu described above, there is a second partial list, the per-node one. In ___slab_alloc we have already searched the pages linked to the current CPU's kmem_cache_cpu; after that, new_slab_objects -> get_partial searches the node's partial list.
|
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu **pc) { void *freelist; struct kmem_cache_cpu *c = *pc; struct page *page; WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); freelist = get_partial(s, flags, node, c); //从kmem_cache_node找空闲对象 if (freelist) return freelist; page = new_slab(s, flags, node); //还是没有的话,就是kmem_cache_cpu和kmem_cache_node中都没有空闲对象了,所以要从伙伴系统中分配。这个函数已经在3.2节中描述过了 if (page) { c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); /* * No other reference to the page yet so we can * muck around with it freely without cmpxchg */ freelist = page->freelist; page->freelist = NULL; stat(s, ALLOC_SLAB); c->page = page; *pc = c; } return freelist; } |
|
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu *c) { void *object; int searchnode = node; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); object = get_partial_node(s, get_node(s, searchnode), c, flags); //找我们struct kmem_cache对应当前node的partial链表中,是否有空闲对象。返回找到的第一个空闲对象 if (object || node != NUMA_NO_NODE) return object; return get_any_partial(s, flags, c); //找其他NUMA节点中,是否有struct list_head partial链接的page。在UMA中就是return NULL } |
|
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, struct kmem_cache_cpu *c, gfp_t flags) { struct page *page, *page2; void *object = NULL; unsigned int available = 0; int objects; /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a * partial slab and there is none available then get_partial() * will return NULL. *///struct kmem_cache_node->partical是一个空链表的时候,我们直接返回NULL if (!n || !n->nr_partial) return NULL; spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, slab_list) { //遍历struct kmem_cache_node的partical链表,每个成员都是一个page void *t; t = acquire_slab(s, n, page, object == NULL, &objects); //我们将每个page的空闲对象数目存到objects上(new.objects - new.inuse),将page从struct kmem_cache_node->partical链表上移除 if (!t) break; available += objects; //available是从kmem_cache_node移动到kmem_cache_cpu的空闲对象总数目 if (!object) { c->page = page; //如果是第一个page,那么设置struct kmem_cache_cpu.page=page。作为后面slub分配使用的page,他遵守1:page.freelist=null;2:page.inuse=.page.objects;3:page.frozen=1 stat(s, ALLOC_FROM_PARTIAL); object = t; } else { put_cpu_partial(s, page, 0); //后续的page不做这种处理,只是将这些page添加到kmem_cache_cpu.partical链表上。因此,会将多个partical页从kmem_cache_node移动到kmem_cache_cpu上 stat(s, CPU_PARTIAL_NODE); } if (!kmem_cache_has_cpu_partial(s) || available > slub_cpu_partial(s) / 2) //如果从kmem_cache_node移动的可用对象数目大于kmem_cache.cpu_partial/2,我们就不再移动了。重申,kmem_cache.cpu_partial可以在/sys/kernel/slab/X/cpu_partial查看 break; } spin_unlock(&n->list_lock); return object; } |
To summarize, the allocation flow above has three stages: allocate from c->freelist; or promote a page from c->partial to c->page and then allocate from c->freelist; or move pages from node->partial onto c->page and c->partial and then allocate from c->freelist.

3.4: Freeing memory with kfree
kfree is implemented as follows:
|
void kfree(const void *x) { struct page *page; void *object = (void *)x; page = virt_to_head_page(x); slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); //可见,参数分别是slub结构体,page结构体和要释放的对象的首地址 } |
|
static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { if (slab_free_freelist_hook(s, &head, &tail, &cnt)) //这个函数不影响正常流程的执行 do_slab_free(s, page, head, tail, cnt, addr); } |
do_slab_free is implemented as follows. Its parameters can be regarded as unchanged from those of slab_free.
|
static __always_inline void do_slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; redo: /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succeed. */ do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); /* Same with comment on barrier() in slab_alloc_node() */ barrier(); if (likely(page == c->page)) { //我们从分配的一节知道,struct kmem_cache_cpu.page和struct kmem_cache_cpu.freelist是对应的。因此,如果要释放的slub的page和当前的c->page是一样的,那么直接释放到当前的freelist就行。这也是快速释放 void **freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); //这时候,让释放的slub对象存放freelist。也就是,我们释放的slub对象放在开头 if (unlikely(!this_cpu_cmpxchg_double( //通过这个函数,设置s->cpu_slab->freelist为head s->cpu_slab->freelist, s->cpu_slab->tid, freelist, tid, head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } stat(s, FREE_FASTPATH); } else __slab_free(s, page, head, tail_obj, cnt, addr); } |
3.4.1: Slow-path free: __slab_free
On the slow allocation path we allocate from kmem_cache_cpu.partial, then from kmem_cache_node.partial, and finally from the buddy system. Correspondingly, a free may also have to go back to kmem_cache_cpu.partial or kmem_cache_node.partial. This happens when the object being freed does not belong to the current kmem_cache_cpu.page, which is easy to picture: for example, we may be freeing an object that was allocated long ago. In that case we cannot push the object's address onto kmem_cache_cpu.freelist, because freelist must always correspond to the current page; it has to go onto one of the other lists instead.
|
static void __slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) //这时,head=tail,是要释放的对象地址 { void *prior; int was_frozen; struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; unsigned long flags; stat(s, FREE_SLOWPATH); do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } prior = page->freelist; //这个page先前的空闲对象 counters = page->counters; set_freepointer(s, tail, prior); //tail和head是同一个指针,指向了这次要释放的对象。因此,将释放的对象放在freelist的最前面 new.counters = counters; was_frozen = new.frozen; //在kmem_cache_cpu上的page是被frozen的 new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { //inuse=0,表示这个page中所有slub被释放。prior=0,表示这个page中所有对象都被分出去,是全分配 if (kmem_cache_has_cpu_partial(s) && !prior) { //这个page之前是全部分配。全部分配的page不在kmem_cache_cpu和kmem_cache_node的partical链表上,而是被提出来了 /* * Slab was on no list before and will be * partially empty * We can defer the list move and instead * freeze it. */ new.frozen = 1; //设置page的frozen标志。我们还暂时不处理这个page,不将他移动到任何partical链表上 } else { /* Needs to be taken off a list *///现在,page中所有对象都空闲,并且之前不是frozen状态,也就是page在struct kmem_cache->node[node]链表上 n = get_node(s, page_to_nid(page)); /* * Speculatively acquire the list_lock. * If the cmpxchg does not succeed then we may * drop the list_lock without any processing. * * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ spin_lock_irqsave(&n->list_lock, flags); } } } while (!cmpxchg_double_slab(s, page, //将释放的对象设置为page->freelist prior, counters, head, new.counters, "__slab_free")); if (likely(!n)) { //当page中并非所有slub空闲会走到这儿 if (likely(was_frozen)) { /* * The list lock was not taken therefore no list * activity can be necessary. *///之前page属于cpu_partial,这里简单更改freelist后,返回 stat(s, FREE_FROZEN); } else if (new.frozen) { /* * If we just froze the page then put it onto the * per cpu partial list. */ put_cpu_partial(s, page, 1); //之前page是全分配,全分配的页不在任何链表上。会走到这里。将page的freelist更改后,并且froze page,再添加到cpu_partial上 stat(s, CPU_PARTIAL_FREE); } return; /* 1:之前page属于node_partical,这次释放后不是全不分配,会走到这里,简单更改page的freelist后返回 2:之前page属于cpu_partical, */ } //只有当这个page中的所有slub都是空闲,并且page在struct kmem_cache->node[node]链表上的时候会走到这里 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) //之前page属于node_partical,这次释放后是全不分配,并且node中nr_partial 过多,会走到这里,将page释放到伙伴系统 goto slab_empty; …… spin_unlock_irqrestore(&n->list_lock, flags); return; //之前page属于node_partical,这次释放后是全不分配,并且node中nr_partial数目不多,会走到这里,简单改freelist后返回 slab_empty: if (prior) { /* * Slab on the partial list. */ remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); } else { /* Slab must be on the full list */ remove_full(s, n, page); //这个函数什么也不做 } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } |
4: vmalloc
vmalloc is generally used to allocate large blocks of memory. It first creates a struct vm_struct recording the kernel virtual address range, then allocates physical pages from the buddy system and establishes the mappings between the virtual and physical addresses. vmalloc is implemented as follows:
|
void *vmalloc(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, //GFP_KERNEL表示在分配过程中,可能会发生内存回收,甚至是进程被阻塞,做直接回收。因此,vmalloc不能使用在原子路径上 __builtin_return_address(0)); } |
|
void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); //VMALLOC_START, VMALLOC_END是内核中,用于vmalloc的虚拟地址空间的起始和结束地址。我们可以通过linux/Documentation/arm64/memory.rst查看这两个值 } |
|
void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { struct vm_struct *area; void *addr; unsigned long real_size = size; size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | //分配vm_struct,用于存储虚拟地址空间结构体 vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node); //分配物理页面,并和vm_struct做映射 if (!addr) return NULL; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ clear_vm_uninitialized_flag(area); return addr; } |
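Here is a minimal usage sketch of vmalloc (hypothetical code): allocate a buffer that would be too large for kmalloc, use it, and release it with vfree. The memory is virtually contiguous but generally not physically contiguous, and the allocation may sleep, so it must not be done in atomic context:
|
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/errno.h>

static void *big_buf;

static int vmalloc_example(void)
{
	big_buf = vmalloc(4 * 1024 * 1024);	/* 4 MiB of virtually contiguous memory */
	if (!big_buf)
		return -ENOMEM;

	memset(big_buf, 0, 4 * 1024 * 1024);
	return 0;
}

static void vmalloc_example_cleanup(void)
{
	vfree(big_buf);				/* unmap and return the pages */
	big_buf = NULL;
} |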
4.1: __get_vm_area_node: allocating the vm_struct and vmap_area
__get_vm_area_node allocates a struct vm_struct and a struct vmap_area. The kernel may contain many vm_struct and vmap_area instances, and they correspond one to one. To locate the structure for a given address quickly, the vmap_area structures are organized into a red-black tree. The function is implemented as follows:
|
static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); size = PAGE_ALIGN(size); if (flags & VM_IOREMAP) align = 1ul << clamp_t(int, get_count_order_long(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); //分配struct vm_struct占用的内存 if (!(flags & VM_NO_GUARD)) //如果没有设置VM_NO_GUARD,我们会给size多加一个page,作为保护页 size += PAGE_SIZE; va = alloc_vmap_area(size, align, start, end, node, gfp_mask); //分配结构体struct vmap_area结构体并初始化。 if (IS_ERR(va)) { kfree(area); return NULL; } kasan_unpoison_vmalloc((void *)va->va_start, requested_size); setup_vmalloc_vm(area, va, flags, caller); //struct vmap_area -> vm = struct vm_struct return area; } |
The function alloc_vmap_area is implemented as shown below:
|
static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { struct vmap_area *va, *pva; unsigned long addr; int purged = 0; int ret; gfp_mask = gfp_mask & GFP_RECLAIM_MASK; va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); //从kmem_cache中分配一个struct vmap_area结构体 if (unlikely(!va)) return ERR_PTR(-ENOMEM); retry: …… /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ addr = __alloc_vmap_area(size, align, vstart, vend); //这里通过红黑树free_vmap_area_root找到了一个合适的vmap_area,也就是一个满足需求的线性地址空间。这里返回线性地址空间的首地址 spin_unlock(&free_vmap_area_lock); va->va_start = addr; va->va_end = addr + size; va->vm = NULL; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); //类似于空闲的vmap_area会构成free_vmap_area_root红黑树,使用中的vmap_area也会构成vmap_area_root红黑树 spin_unlock(&vmap_area_lock); return va; } |
|
static __always_inline unsigned long __alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { unsigned long nva_start_addr; struct vmap_area *va; enum fit_type type; int ret; va = find_vmap_lowest_match(size, align, vstart); //这个函数在红黑树free_vmap_area_root中,找到满足size,vstart的空闲虚拟地址空间。free_vmap_area_root是空闲的struct vmap_area构成的红黑树,vmap_area起始地址越小,他在红黑树中的位置越靠左 if (unlikely(!va)) return vend; if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align); else nva_start_addr = ALIGN(vstart, align); //上面的nva_start_addr就是我们这次要分配出去的struct vmap_area的起始地址 /* Check the "vend" restriction. */ if (nva_start_addr + size > vend) return vend; /* Classify what we have found. */ type = classify_va_fit_type(va, nva_start_addr, size); if (WARN_ON_ONCE(type == NOTHING_FIT)) return vend; //我们要分配一段size大小的虚拟地址空间。这时候我们找到了一个struct vmap_area。但是,这个vmap_area和size之间可能不是完全匹配。因此,我们这里获取他的匹配程度,分别是FL_FIT_TYPE(完全匹配),LE_FIT_TYPE(左匹配),RE_FIT_TYPE(右匹配),NE_FIT_TYPE(非边缘匹配)。对于不是完全匹配的struct vmap_area,我们可能需要将剩下的线性地址空间再次插入红黑树free_vmap_area_root中 /* Update the free vmap_area. */ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); //这里就是我们在上面的注释中描述的逻辑,将剩余的线性地址空间插入红黑树中 if (ret) return vend; return nva_start_addr; } |
So, when vmalloc carves out a range of virtual address space, it relies on the kernel data structure free_vmap_area_root. This red-black tree holds all free vmap_area objects in the system, i.e. the free virtual address space that vmalloc can still hand out. The ranges that are currently in use are kept in the other red-black tree, vmap_area_root.
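As a rough illustration of this "lowest address first" search and the fit classification, the user-space sketch below manages free ranges in a sorted array instead of a red-black tree, purely for brevity. struct range, carve and the other names are made up, and the alignment/vstart handling is ignored.
|
#include <stdio.h>

/* A free virtual address range, analogous to a free vmap_area. */
struct range { unsigned long start, end; };

enum fit { FL_FIT, LE_FIT, RE_FIT, NE_FIT };

/*
 * Carve 'size' bytes out of the lowest free range that can hold it,
 * mirroring find_vmap_lowest_match() + classify_va_fit_type() +
 * adjust_va_to_fit_type(), but over a sorted array instead of a tree.
 */
static unsigned long carve(struct range *holes, int *nr, unsigned long size)
{
    for (int i = 0; i < *nr; i++) {
        unsigned long start = holes[i].start;
        enum fit type;

        if (holes[i].end - start < size)
            continue;

        if (holes[i].end - start == size)
            type = FL_FIT;                 /* range consumed completely   */
        else
            type = LE_FIT;                 /* we always cut from the left */

        if (type == FL_FIT) {
            for (int j = i; j < *nr - 1; j++)   /* remove the range       */
                holes[j] = holes[j + 1];
            (*nr)--;
        } else {
            holes[i].start += size;        /* shrink the leftover range   */
        }
        return start;
    }
    return 0; /* nothing fits */
}

int main(void)
{
    struct range holes[8] = {
        { 0x100000, 0x104000 },            /* 16 KiB hole */
        { 0x200000, 0x300000 },            /*  1 MiB hole */
    };
    int nr = 2;

    printf("alloc 16K -> %#lx\n", carve(holes, &nr, 0x4000));   /* FL_FIT */
    printf("alloc 64K -> %#lx\n", carve(holes, &nr, 0x10000));  /* LE_FIT */
    printf("remaining hole: %#lx-%#lx\n", holes[0].start, holes[0].end);
    return 0;
}
|
Because alignment and vstart are ignored here, only the full fit and the left fit can occur; in the kernel, an aligned start that lands in the middle of a hole also produces the RE_FIT and NE_FIT cases, and the leftover pieces are kept in free_vmap_area_root by adjust_va_to_fit_type.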
4.2:__vmalloc_area_node: allocating physical pages and mapping the virtual addresses to them
The function __vmalloc_area_node is implemented as follows:
|
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; unsigned int array_size = nr_pages * sizeof(struct page *), i; struct page **pages; //使用page数组存放后面分出来的page指针 gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } //上面分配page数组使用的内存。可见,如果要分配的数组大小太大的时候,可能会嵌套调用_vmalloc area->pages = pages; area->nr_pages = nr_pages; for (i = 0; i < area->nr_pages; i++) { struct page *page; if (node == NUMA_NO_NODE) page = alloc_page(gfp_mask); //从伙伴系统中分配页 else page = alloc_pages_node(node, gfp_mask, 0); area->pages[i] = page; if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), //通过这个函数做vmalloc虚拟地址空间和物理地址空间之间的映射。这里get_vm_area_size很有讲究,它不映射保护页。因此,当内存越界访问到保护页的时候,就会发生缺页异常报错。此外,需要知道的是,我们这里将映射添加到页表swapper_pg_dir中 prot, pages) < 0) goto fail; return area->addr; } |
As shown above, vmalloc ends up updating page-table entries in the kernel page table swapper_pg_dir. The kernel part of the address space described by swapper_pg_dir is shared by everyone, whether a kernel thread or a user process running in kernel mode, so once the mapping is in place every process can access the newly mapped addresses.
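One detail worth quantifying in __vmalloc_area_node is the size of the page-pointer array itself. The sketch below (plain user-space C, assuming a 4 KiB page size and 8-byte pointers; not kernel code) shows when that array outgrows one page and therefore forces the nested __vmalloc_node call instead of kmalloc_node.
|
#include <stdio.h>

/* Assumed values for a typical 64-bit configuration with 4 KiB pages. */
#define PAGE_SIZE 4096UL
#define PTR_SIZE  sizeof(void *)

int main(void)
{
    unsigned long sizes[] = { 64 << 10, 1 << 20, 2 << 20, 4 << 20, 64 << 20 };

    for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
        unsigned long nr_pages   = sizes[i] / PAGE_SIZE;
        unsigned long array_size = nr_pages * PTR_SIZE;

        printf("vmalloc(%8lu KiB): %6lu pages, pages[] = %6lu bytes -> %s\n",
               sizes[i] >> 10, nr_pages, array_size,
               array_size > PAGE_SIZE ? "nested __vmalloc_node"
                                      : "kmalloc_node");
    }
    return 0;
}
|
With these assumptions the pages[] array still fits in a single page up to a 2 MiB allocation (512 pointers of 8 bytes); anything larger hands the metadata allocation back to __vmalloc_node, which is why the kernel comment stresses that the recursion is strictly bounded.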
5:out of memory
out_of_memory implements the well-known OOM path: when the system runs out of memory, it picks a process and kills it to free memory.
|
/** * out_of_memory - kill the "best" process when we run out of memory * @oc: pointer to struct oom_control * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; if (oom_killer_disabled) return false; /* * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ if (task_will_free_mem(current)) { //在这里检查current是否正要退出。如果是的话,就选择杀死current进程 mark_oom_victim(current); wake_oom_reaper(current); //唤醒内核线程oom_reaper处理 return true; } /* * The OOM killer does not compensate for IO-less reclaim. * pagefault_out_of_memory lost its gfp context so we have to * make sure exclude 0 mask - all other users should have at least * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to * invoke the OOM killer even if it is a GFP_NOFS allocation. */ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) return true; /* * Check if there were limitations on the allocation (only relevant for * NUMA and memcg) that may require different handling. */ oc->constraint = constrained_alloc(oc); if (oc->constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; check_panic_on_oom(oc); //检查发生oom的时候,是否要做系统panic。一般不做 if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && //如果设置了sysctl_oom_kill_allocating_task,则发生oom的时候,会将当前要内存分配的进程杀掉。这个值可以通过/proc/sys/vm/oom_kill_allocating_task设置 current->mm && !oom_unkillable_task(current) && oom_cpuset_eligible(current, oc) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); //这时候,杀死导致内存分配失败的进程,而不是选择一个进程杀死 return true; } select_bad_process(oc); //一般流程,我们选择一个进程杀死 /* Found nothing?!?! */ if (!oc->chosen) { dump_header(oc, NULL); pr_warn("Out of memory and no killable processes...\n"); /* * If we got here due to an actual allocation at the * system level, we cannot survive this and will enter * an endless loop in the allocator. Bail out now. */ if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) panic("System is deadlocked on memory\n"); } if (oc->chosen && oc->chosen != (void *)-1UL) oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : //杀死我们选择的进程 "Memory cgroup out of memory"); return !!oc->chosen; } |
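The knobs mentioned in the comments above (oom_score_adj, oom_kill_allocating_task, panic_on_oom) are all reachable from user space. The small program below is only an illustration of the per-task knob: it raises its own oom_score_adj so the process becomes a preferred victim, then reads back the resulting oom_score. Lowering the value again typically requires CAP_SYS_RESOURCE.
|
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* Valid range is -1000 (never kill) to 1000 (kill first). */
    FILE *f = fopen("/proc/self/oom_score_adj", "w");
    if (!f) {
        perror("open oom_score_adj");
        return EXIT_FAILURE;
    }
    fprintf(f, "500\n");    /* make this process a preferred OOM victim */
    fclose(f);

    f = fopen("/proc/self/oom_score", "r");
    if (f) {
        char line[32];
        if (fgets(line, sizeof(line), f))
            printf("current oom_score: %s", line);
        fclose(f);
    }
    return 0;
}
|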
5.1:select_bad_process: choosing a process to kill
In the normal case, the kernel searches the whole system for the most suitable process and kills it to resolve the OOM condition.
|
/* * Simple selection loop. We choose the process with the highest number of * 'points'. In case scan was aborted, oc->chosen is set to -1. */ static void select_bad_process(struct oom_control *oc) { oc->chosen_points = LONG_MIN; if (is_memcg_oom(oc)) mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); else { struct task_struct *p; rcu_read_lock(); for_each_process(p) //遍历系统中的所有进程。通过task_struct -> tasks.next完成遍历 if (oom_evaluate_task(p, oc)) break; rcu_read_unlock(); } } |
|
static int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; long points; if (oom_unkillable_task(task)) //内核线程,和进程1属于同一个线程组的进程不能杀 goto next; …… points = oom_badness(task, oc->totalpages); //计算当前进程的oom得分 if (points == LONG_MIN || points < oc->chosen_points) //我们最终杀死得分最高的那个进程 goto next; select: if (oc->chosen) put_task_struct(oc->chosen); get_task_struct(task); oc->chosen = task; oc->chosen_points = points; next: return 0; abort: if (oc->chosen) put_task_struct(oc->chosen); oc->chosen = (void *)-1UL; return 1; } |
|
long oom_badness(struct task_struct *p, unsigned long totalpages) { long points; long adj; if (oom_unkillable_task(p)) return LONG_MIN; p = find_lock_task_mm(p); if (!p) return LONG_MIN; /* * Do not even consider tasks which are explicitly marked oom * unkillable or have been already oom reaped or the are in * the middle of vfork */ adj = (long)p->signal->oom_score_adj; //可以通过/proc/PID/oom_score_adj查看 if (adj == OOM_SCORE_ADJ_MIN || //oom_score_adj设置为-1000的进程不会被oom杀死 test_bit(MMF_OOM_SKIP, &p->mm->flags) || in_vfork(p)) { task_unlock(p); return LONG_MIN; } /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; //计算进程使用的页数 task_unlock(p); /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj; //将分数做处理后,返回分数 return points; } |
The OOM score of each process can also be read from /proc/<pid>/oom_score.
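To see how these terms combine, here is a small arithmetic sketch of the formula above with made-up numbers: a task with 100000 resident pages, 20000 swapped-out pages and 512 KiB of page tables, oom_score_adj = 300, on a machine with 4 GiB of RAM (1048576 pages of 4 KiB).
|
#include <stdio.h>

int main(void)
{
    /* Illustrative values only, all counted in 4 KiB pages. */
    long rss        = 100000;              /* get_mm_rss(p->mm)               */
    long swapents   = 20000;               /* MM_SWAPENTS counter             */
    long pgtables   = (512 * 1024) / 4096; /* mm_pgtables_bytes() / PAGE_SIZE */
    long adj        = 300;                 /* /proc/<pid>/oom_score_adj       */
    long totalpages = 1048576;             /* 4 GiB of RAM in 4 KiB pages     */

    long points = rss + swapents + pgtables;   /* baseline: memory footprint  */
    points += adj * (totalpages / 1000);       /* normalized oom_score_adj    */

    printf("baseline = %ld, adj term = %ld, badness = %ld\n",
           rss + swapents + pgtables, adj * (totalpages / 1000), points);
    return 0;
}
|
With these numbers the baseline is 120128 pages and the adjustment adds 300 * 1048 = 314400, giving a badness of 434528; select_bad_process simply keeps the task with the largest such value, as the points < oc->chosen_points comparison in oom_evaluate_task shows.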