Chapter 8 Supplement: Memory Allocation in Linux 5.10.110

        Chapter 8 described kernel memory allocation in Linux 2.6.11; this supplement describes memory allocation in Linux 5.10.110 from three angles. First the buddy system: as we have seen, it is the foundation of all kernel memory allocation, and both slab and vmalloc ultimately obtain their memory from it. Next kmem_cache and kmalloc, the most frequently used kernel allocation interfaces. Finally vmalloc, which, compared with kmalloc, is mainly used to allocate large blocks of memory.

1: Data structures

        This section describes the structures used by the memory-management code.

1.1: Nodes and zones

        Modern machines organize memory into nodes, and each node in turn contains several memory zones. Every node is described by a struct pglist_data. On a NUMA system there are multiple nodes, and their struct pglist_data instances form an array:

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;

On a UMA system there is only one node, which is described by

struct pglist_data __refdata contig_page_data;

In short, struct pglist_data describes one memory node.

/*

 * On NUMA machines, each NUMA node would have a pg_data_t to describe

 * it's memory layout. On UMA machines there is a single pglist_data which

 * describes the whole memory.

 *

 * Memory statistics and page replacement data structures are maintained on a

 * per-zone basis.

 */

typedef struct pglist_data {

/*

 * node_zones contains just the zones for THIS node. Not all of the

 * zones may be populated, but it is the full list. It is referenced by

 * this node's node_zonelists as well as other node's node_zonelists.

 *///all the zones contained in this node

struct zone node_zones[MAX_NR_ZONES];         

/*

 * node_zonelists contains references to all zones in all nodes.

 * Generally the first zones will be references to this node's

 * node_zones.

 */

/*

In UMA, node_zonelists holds struct zonelist entries; a struct zonelist is an array of zoneref entries covering all zones in the system:

struct zonelist {

struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];

};

*/

struct zonelist node_zonelists[MAX_ZONELISTS];     

int nr_zones; /* number of populated zones in this node */

#ifdef CONFIG_FLAT_NODE_MEM_MAP        /* means !SPARSEMEM */

struct page *node_mem_map;

#ifdef CONFIG_PAGE_EXTENSION

struct page_ext *node_page_ext;

#endif

#endif

#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)

/*

 * Must be held any time you expect node_start_pfn,

 * node_present_pages, node_spanned_pages or nr_zones to stay constant.

 * Also synchronizes pgdat->first_deferred_pfn during deferred page

 * init.

 *

 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to

 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG

 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.

 *

 * Nests above zone->lock and zone->span_seqlock

 */

spinlock_t node_size_lock;

#endif

unsigned long node_start_pfn;

unsigned long node_present_pages; /* total number of physical pages */

unsigned long node_spanned_pages; /* total size of physical page

     range, including holes */

int node_id;

wait_queue_head_t kswapd_wait;

wait_queue_head_t pfmemalloc_wait;

struct task_struct *kswapd;        /* Protected by

   mem_hotplug_begin/end() */

int kswapd_order;

enum zone_type kswapd_highest_zoneidx;

int kswapd_failures;                /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION

int kcompactd_max_order;

enum zone_type kcompactd_highest_zoneidx;

wait_queue_head_t kcompactd_wait;

struct task_struct *kcompactd;

#endif

/*

 * This is a per-node reserve of pages that are not available

 * to userspace allocations.

 */

unsigned long                totalreserve_pages;

#ifdef CONFIG_NUMA

/*

 * node reclaim becomes active if more unmapped pages exist.

 */

unsigned long                min_unmapped_pages;

unsigned long                min_slab_pages;

#endif /* CONFIG_NUMA */

/* Write-intensive fields used by page reclaim */

ZONE_PADDING(_pad1_)

spinlock_t                lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT

/*

 * If memory initialisation on large machines is deferred then this

 * is the first PFN that needs to be initialised.

 */

unsigned long first_deferred_pfn;

#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

struct deferred_split deferred_split_queue;

#endif

/* Fields commonly accessed by the page reclaim scanner */

/*

 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.

 *

 * Use mem_cgroup_lruvec() to look up lruvecs.

 */

struct lruvec                __lruvec;

unsigned long                flags;

ZONE_PADDING(_pad2_)

/* Per-node vmstats */

struct per_cpu_nodestat __percpu *per_cpu_nodestats;

atomic_long_t                vm_stat[NR_VM_NODE_STAT_ITEMS];

} pg_data_t;

        A node contains several memory zones; each zone is described by a struct zone:

struct zone {

/* Read-mostly fields */

/* zone watermarks, access with *_wmark_pages(zone) macros */

unsigned long _watermark[NR_WMARK];

unsigned long watermark_boost;

unsigned long nr_reserved_highatomic;

/*

 * We don't know if the memory that we're going to allocate will be

 * freeable or/and it will be released eventually, so to avoid totally

 * wasting several GB of ram we must reserve some of the lower zone

 * memory (otherwise we risk to run OOM on the lower zones despite

 * there being tons of freeable ram on the higher zones).  This array is

 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl

 * changes.

 */

long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NEED_MULTIPLE_NODES

int node;

#endif

struct pglist_data        *zone_pgdat;

struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM

/*

 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.

 * In SPARSEMEM, this map is stored in struct mem_section

 */

unsigned long                *pageblock_flags;

#endif /* CONFIG_SPARSEMEM */

/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */

unsigned long                zone_start_pfn;

/*

 * spanned_pages is the total pages spanned by the zone, including

 * holes, which is calculated as:

 *         spanned_pages = zone_end_pfn - zone_start_pfn;

 *

 * present_pages is physical pages existing within the zone, which

 * is calculated as:

 *        present_pages = spanned_pages - absent_pages(pages in holes);

 *

 * managed_pages is present pages managed by the buddy system, which

 * is calculated as (reserved_pages includes pages allocated by the

 * bootmem allocator):

 *        managed_pages = present_pages - reserved_pages;

 *

 * So present_pages may be used by memory hotplug or memory power

 * management logic to figure out unmanaged pages by checking

 * (present_pages - managed_pages). And managed_pages should be used

 * by page allocator and vm scanner to calculate all kinds of watermarks

 * and thresholds.

 *

 * Locking rules:

 *

 * zone_start_pfn and spanned_pages are protected by span_seqlock.

 * It is a seqlock because it has to be read outside of zone->lock,

 * and it is done in the main allocator path.  But, it is written

 * quite infrequently.

 *

 * The span_seq lock is declared along with zone->lock because it is

 * frequently read in proximity to zone->lock.  It's good to

 * give them a chance of being in the same cacheline.

 *

 * Write access to present_pages at runtime should be protected by

 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of

 * present_pages should get_online_mems() to get a stable value.

 */

atomic_long_t                managed_pages;

unsigned long                spanned_pages;

unsigned long                present_pages;

const char                *name;

int initialized;

/* Write-intensive fields used from the page allocator */

ZONE_PADDING(_pad1_)

/* free areas of different sizes */

struct free_area        free_area[MAX_ORDER];

/* zone flags, see below */

unsigned long                flags;

/* Primarily protects free_area */

spinlock_t                lock;

/* Write-intensive fields used by compaction and vmstats. */

ZONE_PADDING(_pad2_)

/*

 * When free pages are below this point, additional steps are taken

 * when reading the number of free pages to avoid per-cpu counter

 * drift allowing watermarks to be breached

 */

unsigned long percpu_drift_mark;

bool                        contiguous;

ZONE_PADDING(_pad3_)

/* Zone statistics */

atomic_long_t                vm_stat[NR_VM_ZONE_STAT_ITEMS];

atomic_long_t                vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];

} ____cacheline_internodealigned_in_smp;
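        To make the node/zone relationship concrete, the following is a minimal kernel-context sketch (my own illustration, not taken from the original text): it walks one node's zones through its pg_data_t. NODE_DATA(), MAX_NR_ZONES, populated_zone() and the zone fields used are standard kernel symbols.

#include <linux/mmzone.h>
#include <linux/printk.h>

static void dump_node_zones(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);	/* this node's struct pglist_data */
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = &pgdat->node_zones[i];

		if (!populated_zone(zone))	/* skip zones with no present pages */
			continue;
		pr_info("node %d zone %-8s spanned %lu present %lu\n",
			nid, zone->name, zone->spanned_pages, zone->present_pages);
	}
}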

1.2:struct alloc_context

        The structure struct alloc_context describes a single request to allocate physical memory from the buddy system.

/*

 * in mm/page_alloc.c

 */

/*

 * Structure for holding the mostly immutable allocation parameters passed

 * between functions involved in allocations, including the alloc_pages*

 * family of functions.

 *

 * nodemask, migratetype and highest_zoneidx are initialized only once in

 * __alloc_pages_nodemask() and then never change.

 *

 * zonelist, preferred_zone and highest_zoneidx are set first in

 * __alloc_pages_nodemask() for the fast path, and might be later changed

 * in __alloc_pages_slowpath(). All other functions pass the whole structure

 * by a const pointer.

 */

struct alloc_context {

struct zonelist *zonelist;

nodemask_t *nodemask;

struct zoneref *preferred_zoneref;

int migratetype;

/*

 * highest_zoneidx represents highest usable zone index of

 * the allocation request. Due to the nature of the zone,

 * memory on lower zone than the highest_zoneidx will be

 * protected by lowmem_reserve[highest_zoneidx].

 *

 * highest_zoneidx is also used by reclaim/compaction to limit

 * the target zone since higher zone than this index cannot be

 * usable for this allocation request.

 */

enum zone_type highest_zoneidx;

bool spread_dirty_pages;

};

2: The buddy system

        The external interface of the buddy system is alloc_page or alloc_pages: given the allocation flags and the number of pages (1<<order), it allocates that many contiguous physical pages and returns the struct page of the first one.
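        Before tracing the call chain, here is a hedged usage sketch (my own module-context illustration, not code from the original text): allocate 1<<2 = 4 contiguous pages, use their direct-mapped virtual address, and free them again.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/errno.h>

static void *buf;

static int grab_pages(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* order 2; may sleep */

	if (!page)
		return -ENOMEM;
	buf = page_address(page);	/* linear-map virtual address of the first page */
	return 0;
}

static void drop_pages(void)
{
	if (buf)
		__free_pages(virt_to_page(buf), 2);	/* the order must match the allocation */
}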

#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)

{

return alloc_pages_node(numa_node_id(), gfp_mask, order);              //on a UMA system, numa_node_id() returns 0

}

/*

 * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,

 * prefer the current CPU's closest node. Otherwise node must be valid and

 * online.

 */

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,

unsigned int order)

{

if (nid == NUMA_NO_NODE)

nid = numa_mem_id();

return __alloc_pages_node(nid, gfp_mask, order);

}

/*

 * Allocate pages, preferring the node given as nid. The node must be valid and

 * online. For more general interface, see alloc_pages_node().

 */

static inline struct page *

__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)

{

VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);

VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));

return __alloc_pages(gfp_mask, order, nid);

}

static inline struct page *

__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)

{

return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);

}

        The above is only a thin call chain with nothing of substance; we finally arrive at __alloc_pages_nodemask, the heart of the buddy allocator.

2.1: The core of the buddy system, __alloc_pages_nodemask

        __alloc_pages_nodemask is implemented as follows:

/*

 * This is the 'heart' of the zoned buddy allocator.

 */

struct page *

__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,

nodemask_t *nodemask)

{

struct page *page;

unsigned int alloc_flags = ALLOC_WMARK_LOW;

gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */

struct alloc_context ac = { };

/*

 * There are several places where we assume that the order value is sane

 * so bail out early if the request is out of bound.

 */

if (unlikely(order >= MAX_ORDER)) {

WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));

return NULL;

}

gfp_mask &= gfp_allowed_mask;

alloc_mask = gfp_mask;

if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))                 //initialize the struct alloc_context

return NULL;

/*

 * Forbid the first pass from falling back to types that fragment

 * memory until all local zones are considered.

 */

alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);

/* First allocation attempt *///the first attempt: the fast path

page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);

if (likely(page))

goto out;

/*

 * Apply scoped allocation constraints. This is mainly about GFP_NOFS

 * resp. GFP_NOIO which has to be inherited for all allocation requests

 * from a particular context which has been marked by

 * memalloc_no{fs,io}_{save,restore}.

 */

alloc_mask = current_gfp_context(gfp_mask);

ac.spread_dirty_pages = false;

/*

 * Restore the original nodemask if it was potentially replaced with

 * &cpuset_current_mems_allowed to optimize the fast-path attempt.

 */

ac.nodemask = nodemask;

page = __alloc_pages_slowpath(alloc_mask, order, &ac);

out:

return page;

}

        This function consists of three main parts:

        1: prepare_alloc_pages

        2: get_page_from_freelist

        3: __alloc_pages_slowpath

2.2:prepare_alloc_pages

        This function mainly fills in the struct alloc_context. Its implementation:

/*

 * in mm/page_alloc.c

 */

/*

 * Structure for holding the mostly immutable allocation parameters passed

 * between functions involved in allocations, including the alloc_pages*

 * family of functions.

 *

 * nodemask, migratetype and highest_zoneidx are initialized only once in

 * __alloc_pages_nodemask() and then never change.

 *

 * zonelist, preferred_zone and highest_zoneidx are set first in

 * __alloc_pages_nodemask() for the fast path, and might be later changed

 * in __alloc_pages_slowpath(). All other functions pass the whole structure

 * by a const pointer.

 *///from the comment we know that zonelist, preferred_zone and highest_zoneidx may change between the fast path and the slow path

struct alloc_context {

struct zonelist *zonelist;

nodemask_t *nodemask;

struct zoneref *preferred_zoneref;

int migratetype;

/*

 * highest_zoneidx represents highest usable zone index of

 * the allocation request. Due to the nature of the zone,

 * memory on lower zone than the highest_zoneidx will be

 * protected by lowmem_reserve[highest_zoneidx].

 *

 * highest_zoneidx is also used by reclaim/compaction to limit

 * the target zone since higher zone than this index cannot be

 * usable for this allocation request.

 *///this enum can be ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM or ZONE_MOVABLE; it caps the highest zone index that may be used

enum zone_type highest_zoneidx;

bool spread_dirty_pages;

};

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,

int preferred_nid, nodemask_t *nodemask,

struct alloc_context *ac, gfp_t *alloc_mask,

unsigned int *alloc_flags)

{

ac->highest_zoneidx = gfp_zone(gfp_mask);         //the low 4 bits of gfp_mask select the zones the allocation may use. For example, GFP_HIGHUSER_MOVABLE contains __GFP_HIGHMEM | __GFP_MOVABLE, which GFP_ZONE_TABLE maps to ZONE_MOVABLE, i.e. zones up to ZONE_MOVABLE may be used

ac->zonelist = node_zonelist(preferred_nid, gfp_mask);      //every node's struct zonelist node_zonelists[MAX_ZONELISTS] covers all zones in the system, but each node's node_zonelists begins with its own zones. Within a node the zones are ordered from high to low, so the zonelist may be this node's NORMAL, DMA32, DMA in that order

ac->nodemask = nodemask;

ac->migratetype = gfp_migratetype(gfp_mask);       //depending on whether __GFP_RECLAIMABLE or __GFP_MOVABLE is set in gfp_mask, set migratetype to a value of enum migratetype

might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

if (should_fail_alloc_page(gfp_mask, order))

return false;

*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);

/* Dirty zone balancing only done in the fast path */

ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

/*

 * The preferred zone is used for statistics but crucially it is

 * also used as the starting point for the zonelist iterator. It

 * may get reset for allocations that ignore memory policies.

 */

ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,    //pick the zone where allocation starts: the first zoneref whose zone_idx <= highest_zoneidx. For example, with GFP_KERNEL we try NORMAL, DMA32, DMA in that order, so NORMAL is chosen here

ac->highest_zoneidx, ac->nodemask);

return true;

}

2.3:get_page_from_freelist

        get_page_from_freelist is the first, fast-path allocation attempt. Its parameters: gfp_mask and order are the values the caller passed in; alloc_flags carries the watermark to use, whether reclaim may be performed, and similar information; ac carries the zones to allocate from. get_page_from_freelist is implemented as follows:

static struct page *

get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,

const struct alloc_context *ac)

{

struct zoneref *z;

struct zone *zone;

struct pglist_data *last_pgdat_dirty_limit = NULL;

bool no_fallback;

retry:

/*

 * Scan zonelist, looking for a zone with enough free.

 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.

 */

no_fallback = alloc_flags & ALLOC_NOFRAGMENT;

z = ac->preferred_zoneref;

for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,         //iterate over every zone this allocation may use

ac->nodemask) {

struct page *page;

unsigned long mark;

……//this part is not discussed here

mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);//get the watermark used for this allocation. On the fast path this is the LOW watermark; mark is a number of pages

if (!zone_watermark_fast(zone, order, mark,         //check whether this zone can satisfy the allocation: after taking the pages, the remaining free memory must stay above a value derived from the LOW watermark. If it can, fall through to rmqueue

       ac->highest_zoneidx, alloc_flags,

       gfp_mask)) {

int ret;

/* Checked here to keep the fast path fast */

BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);

if (alloc_flags & ALLOC_NO_WATERMARKS)

goto try_this_zone;

continue;          //assuming a UMA system, we always reach this continue

……           //the remaining code only matters on NUMA; we skip it

}

try_this_zone://reaching here means we found a zone that satisfies the watermark, or ALLOC_NO_WATERMARKS is set and watermarks are ignored

page = rmqueue(ac->preferred_zoneref->zone, zone, order,            //rmqueue actually takes pages out of the buddy free lists

gfp_mask, alloc_flags, ac->migratetype);

if (page) {

prep_new_page(page, order, gfp_mask, alloc_flags);        //set page->_refcount = 1 and clear page->private

/*

 * If this is a high-order atomic allocation then check

 * if the pageblock should be reserved for the future

 */

if (unlikely(order && (alloc_flags & ALLOC_HARDER)))

reserve_highatomic_pageblock(page, zone, order);

return page;

}       //the chosen zone may still fail to produce pages; in that case continue the outer loop

}

//every zone has been tried without finding a suitable one; fall through and possibly return NULL

/*

 * It's possible on a UMA machine to get through all zones that are

 * fragmented. If avoiding fragmentation, reset and try again.

 */

if (no_fallback) {

alloc_flags &= ~ALLOC_NOFRAGMENT;

goto retry;

}

return NULL;

}

        So on the first, fast-path attempt we scan the zones and check whether one of them satisfies the condition; if so, we allocate through rmqueue, otherwise we return NULL. The condition is tied to the LOW watermark: when the watermark is high, the fast path may well fail to produce memory.

        Note that the fast path never performs memory reclaim.
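        To illustrate the role of the watermark, here is a simplified userspace model (an assumption-laden sketch of my own, not the kernel's zone_watermark_fast(), which additionally checks the per-order free areas): an order-N request passes only if the free pages remaining after it would still exceed the watermark plus the lowmem reserve.

#include <stdbool.h>
#include <stdio.h>

static bool watermark_ok(long free_pages, unsigned int order,
			 long watermark, long lowmem_reserve)
{
	free_pages -= (1L << order) - 1;	/* pages this request would consume */
	return free_pages > watermark + lowmem_reserve;
}

int main(void)
{
	/* 10000 free pages, LOW watermark 8000, reserve 1000: an order-3 request fits */
	printf("order 3, low watermark: %s\n",
	       watermark_ok(10000, 3, 8000, 1000) ? "ok" : "fail");
	/* raise the watermark to 9500 and the same request fails the fast path */
	printf("order 3, high watermark: %s\n",
	       watermark_ok(10000, 3, 9500, 1000) ? "ok" : "fail");
	return 0;
}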

2.4:__alloc_pages_slowpath

        If the fast-path attempt fails, we fall into the slow path. The parameters: gfp_mask and order are the caller's values; ac carries the zones to use and related information.

static inline struct page *

__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,

struct alloc_context *ac)

{

bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;

const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;   //order > 3? Small orders are barely affected by fragmentation, so when they fail we should not bother with compaction (migration)

struct page *page = NULL;

unsigned int alloc_flags;

unsigned long did_some_progress;

enum compact_priority compact_priority;

enum compact_result compact_result;

int compaction_retries;

int no_progress_loops;

unsigned int cpuset_mems_cookie;

int reserve_flags;

/*

 * We also sanity check to catch abuse of atomic reserves being used by

 * callers that are not in atomic context.

 *///__GFP_ATOMIC means the allocation runs in atomic context and must not perform direct reclaim

if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==

(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))

gfp_mask &= ~__GFP_ATOMIC;

retry_cpuset:

compaction_retries = 0;

no_progress_loops = 0;

compact_priority = DEF_COMPACT_PRIORITY;

/*

 * The fast path uses conservative alloc_flags to succeed only until

 * kswapd needs to be woken up, and to avoid the cost of setting up

 * alloc_flags precisely. So we do that now.

 *///compared with the fast path, the slow path adjusts alloc_flags: 1: use the MIN watermark; 2: if gfp_mask has __GFP_HIGH (high priority) or __GFP_KSWAPD_RECLAIM (may wake kswapd), set the corresponding bits; 3: if gfp_mask has __GFP_ATOMIC, or the current task is a realtime task not running in interrupt context, set ALLOC_HARDER

alloc_flags = gfp_to_alloc_flags(gfp_mask);

/*

 * We need to recalculate the starting point for the zonelist iterator

 * because we might have used different nodemask in the fast path, or

 * there was a cpuset modification and we are retrying - otherwise we

 * could end up iterating over non-eligible zones endlessly.

 *///recompute preferred_zoneref

ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,

ac->highest_zoneidx, ac->nodemask);

if (!ac->preferred_zoneref->zone)

goto nopage;

if (alloc_flags & ALLOC_KSWAPD)//if gfp_mask has __GFP_KSWAPD_RECLAIM, wake kswapd here to reclaim in the background. Note that reclaim comes in two forms: __GFP_KSWAPD_RECLAIM (via kswapd) and __GFP_DIRECT_RECLAIM (direct reclaim)

wake_all_kswapds(order, gfp_mask, ac);

/*

 * The adjusted alloc_flags might result in immediate success, so try

 * that first

 *///with the MIN watermark in effect and kswapd woken, try the free lists once more

page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

if (page)

goto got_pg;

/*

 * For costly allocations, try direct compaction first, as it's likely

 * that we have enough base pages and don't need to reclaim. For non-

 * movable high-order allocations, do that as well, as compaction will

 * try prevent permanent fragmentation by migrating from blocks of the

 * same migratetype.

 * Don't try this for allocations that are allowed to ignore

 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.

 *///the code below deals with memory compaction, which we do not cover here

if (can_direct_reclaim &&

(costly_order ||

   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))

&& !gfp_pfmemalloc_allowed(gfp_mask)) {

page = __alloc_pages_direct_compact(gfp_mask, order,

alloc_flags, ac,

INIT_COMPACT_PRIORITY,

&compact_result);

if (page)

goto got_pg;

/*

 * Checks for costly allocations with __GFP_NORETRY, which

 * includes some THP page fault allocations

 */

if (costly_order && (gfp_mask & __GFP_NORETRY)) {

/*

 * If allocating entire pageblock(s) and compaction

 * failed because all zones are below low watermarks

 * or is prohibited because it recently failed at this

 * order, fail immediately unless the allocator has

 * requested compaction and reclaim retry.

 *

 * Reclaim is

 *  - potentially very expensive because zones are far

 *    below their low watermarks or this is part of very

 *    bursty high order allocations,

 *  - not guaranteed to help because isolate_freepages()

 *    may not iterate over freed pages as part of its

 *    linear scan, and

 *  - unlikely to make entire pageblocks free on its

 *    own.

 */

if (compact_result == COMPACT_SKIPPED ||

    compact_result == COMPACT_DEFERRED)

goto nopage;

/*

 * Looks like reclaim/compaction is worth trying, but

 * sync compaction could be very expensive, so keep

 * using async compaction.

 */

compact_priority = INIT_COMPACT_PRIORITY;

}

}

retry://even with the MIN watermark the allocation has not succeeded

/* Ensure kswapd doesn't accidentally go to sleep as long as we loop *///wake kswapd again

if (alloc_flags & ALLOC_KSWAPD)

wake_all_kswapds(order, gfp_mask, ac);

reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);//if the current task is itself freeing memory, ALLOC_NO_WATERMARKS is set and watermarks are no longer used

if (reserve_flags)

alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);

/*

 * Reset the nodemask and zonelist iterators if memory policies can be

 * ignored. These allocations are high priority and system rather than

 * user oriented.

 */

if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {

ac->nodemask = NULL;

ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,

ac->highest_zoneidx, ac->nodemask);

}

/* Attempt with potentially adjusted zonelist and alloc_flags *///retry the allocation after the watermark/flag adjustments and the kswapd wakeup

page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

if (page)

goto got_pg;

/* Caller is not willing to reclaim, we can't balance anything *///the caller does not allow direct reclaim

if (!can_direct_reclaim)

goto nopage;

/* Avoid recursion of direct reclaim *///the current task is itself the reclaimer; avoid recursing into reclaim

if (current->flags & PF_MEMALLOC)

goto nopage;

/* Try direct reclaim and then allocating */

page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,//perform direct reclaim, then see whether the allocation succeeds

&did_some_progress);

if (page)

goto got_pg;

/* Try direct compaction and then allocating */

page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,compact_priority, &compact_result);

if (page)

goto got_pg;

/* Do not loop if specifically requested *///reclaim and compaction have both failed; with __GFP_NORETRY we give up without retrying or invoking the OOM killer

if (gfp_mask & __GFP_NORETRY)

goto nopage;

/*

 * Do not retry costly high order allocations unless they are

 * __GFP_RETRY_MAYFAIL

 */

if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))

goto nopage;

if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,

 did_some_progress > 0, &no_progress_loops))

goto retry;                          //retry reclaim; should_reclaim_retry checks whether no_progress_loops exceeds MAX_RECLAIM_RETRIES (16). After 16 reclaim/compaction attempts without progress, fall through towards OOM

/*

 * It doesn't make any sense to retry for the compaction if the order-0

 * reclaim is not able to make any progress because the current

 * implementation of the compaction depends on the sufficient amount

 * of free memory (see __compaction_suitable)

 */

if (did_some_progress > 0 &&

should_compact_retry(ac, order, alloc_flags,               //decide whether compaction is worth retrying

compact_result, &compact_priority,

&compaction_retries))

goto retry;

/* Reclaim has failed us, start killing things *///the OOM-killer mechanism

page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);

if (page)

goto got_pg;

/* Avoid allocations with no watermarks from looping endlessly */

if (tsk_is_oom_victim(current) &&

    (alloc_flags & ALLOC_OOM ||

     (gfp_mask & __GFP_NOMEMALLOC)))

goto nopage;

/* Retry as long as the OOM killer is making progress *///the OOM killer freed something; try the allocation again

if (did_some_progress) {

no_progress_loops = 0;

goto retry;

}

nopage:

/*

 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure

 * we always retry

 */

if (gfp_mask & __GFP_NOFAIL) {    //with __GFP_NOFAIL we keep retrying and never return NULL

/*

 * All existing users of the __GFP_NOFAIL are blockable, so warn

 * of any new users that actually require GFP_NOWAIT

 */

if (WARN_ON_ONCE(!can_direct_reclaim))

goto fail;

/*

 * PF_MEMALLOC request from this context is rather bizarre

 * because we cannot reclaim anything and only can loop waiting

 * for somebody to do a work for us

 */

WARN_ON_ONCE(current->flags & PF_MEMALLOC);

/*

 * non failing costly orders are a hard requirement which we

 * are not prepared for much so let's warn about these users

 * so that we can identify them and convert them to something

 * else.

 */

WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);

/*

 * Help non-failing allocations by giving them access to memory

 * reserves but do not use ALLOC_NO_WATERMARKS because this

 * could deplete whole memory reserves which would just make

 * the situation worse

 */

page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);

if (page)

goto got_pg;

cond_resched();

goto retry;

}

fail:

warn_alloc(gfp_mask, ac->nodemask,

"page allocation failure: order:%u", order);

got_pg:

return page;

}

        So on the slow path we reclaim memory through kswapd, or by direct reclaim (provided __GFP_DIRECT_RECLAIM is set), before retrying the allocation.
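        A hedged illustration (my own, not from the original text) of how the GFP flags steer these paths: GFP_KERNEL may sleep and therefore may enter direct reclaim, while GFP_ATOMIC must not sleep, so it never reclaims directly and instead leans on ALLOC_HARDER and the reserves.

#include <linux/gfp.h>

/* process context: sleeping, kswapd wakeup and direct reclaim are all allowed */
static struct page *alloc_in_process_ctx(unsigned int order)
{
	return alloc_pages(GFP_KERNEL, order);
}

/* interrupt/atomic context: never sleeps, so no direct reclaim is performed */
static struct page *alloc_in_atomic_ctx(unsigned int order)
{
	return alloc_pages(GFP_ATOMIC, order);
}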

3: kmalloc

        kmalloc needs no introduction, so let's go straight to its implementation.

/**

 * kmalloc - allocate memory

 * @size: how many bytes of memory are required.

 * @flags: the type of memory to allocate.

 *

 * kmalloc is the normal method of allocating memory

 * for objects smaller than page size in the kernel.

 *

 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN

 * bytes. For @size of power of two bytes, the alignment is also guaranteed

 * to be at least to the size.

 *

 * The @flags argument may be one of the GFP flags defined at

 * include/linux/gfp.h and described at

 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`

 *

 * The recommended usage of the @flags is described at

 * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`

 *

 * Below is a brief outline of the most useful GFP flags

 *

 * %GFP_KERNEL

 *        Allocate normal kernel ram. May sleep.

 *

 * %GFP_NOWAIT

 *        Allocation will not sleep.

 *

 * %GFP_ATOMIC

 *        Allocation will not sleep.  May use emergency pools.

 *

 * %GFP_HIGHUSER

 *        Allocate memory from high memory on behalf of user.

 *

 * Also it is possible to set different flags by OR'ing

 * in one or more of the following additional @flags:

 *

 * %__GFP_HIGH

 *        This allocation has high priority and may use emergency pools.

 *

 * %__GFP_NOFAIL

 *        Indicate that this allocation is in no way allowed to fail

 *        (think twice before using).

 *

 * %__GFP_NORETRY

 *        If memory is not immediately available,

 *        then give up at once.

 *

 * %__GFP_NOWARN

 *        If allocation fails, don't issue any warnings.

 *

 * %__GFP_RETRY_MAYFAIL

 *        Try really hard to succeed the allocation but fail

 *        eventually.

 */

static __always_inline void *kmalloc(size_t size, gfp_t flags)

{

return __kmalloc(size, flags);

}

void *__kmalloc(size_t size, gfp_t flags)

{

struct kmem_cache *s;

void *ret;

if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))     //KMALLOC_MAX_CACHE_SIZE = 1 << 13 (8 KiB) with 4 KiB pages

return kmalloc_large(size, flags);

s = kmalloc_slab(size, flags);          //pick a suitable kmalloc cache for this size

if (unlikely(ZERO_OR_NULL_PTR(s)))

return s;

ret = slab_alloc(s, flags, _RET_IP_);

trace_kmalloc(_RET_IP_, ret, size, s->size, flags);

ret = kasan_kmalloc(s, ret, size, flags);

return ret;

}
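        For completeness, here is a typical kmalloc()/kfree() pair in process context (a usage sketch of my own, not code from the original text):

#include <linux/slab.h>

struct foo {
	int id;
	char name[16];
};

static struct foo *make_foo(void)
{
	struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL);	/* may sleep */

	if (!f)
		return NULL;
	f->id = 1;
	return f;
}

static void destroy_foo(struct foo *f)
{
	kfree(f);	/* kfree(NULL) is a no-op */
}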

        The relationships among the slub-related data structures are summarized in the figure below:

        The figure conveys several points.

        1: kmem_cache is the management structure of a slub cache.

        2: slub acts as a cache in front of the buddy system and is itself layered: kmem_cache_cpu.page -> kmem_cache_cpu.partial -> kmem_cache.node.partial. An allocation first tries kmem_cache_cpu.page; failing that, a page from this CPU's kmem_cache_cpu.partial list is promoted to kmem_cache_cpu.page and used; failing that, the node's kmem_cache.node.partial list is consulted, the first page with free objects becomes kmem_cache_cpu.page and a few more pages are moved onto kmem_cache_cpu.partial; only if all of that fails are pages allocated from the buddy system (a simplified sketch of this fallback order follows this list).

        3: A page acting as kmem_cache_cpu.page satisfies: page->inuse = page->objects, page->frozen = 1, page->freelist = NULL.

        4: A page on kmem_cache_cpu.partial satisfies: page->inuse is the number of objects actually in use, page->frozen = 1, and page->freelist points to the first free object.
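        The fallback order in point 2 can be modelled by the following self-contained userspace sketch (the toy structures are my own; they only mimic kmem_cache_cpu/kmem_cache_node and are not the kernel's types):

#include <stdio.h>
#include <stddef.h>

struct toy_page { int free_objects; struct toy_page *next; };
struct toy_cpu  { struct toy_page *page; struct toy_page *partial; };
struct toy_node { struct toy_page *partial; };

static struct toy_page buddy_page = { 8, NULL };	/* stands in for a fresh buddy allocation */

static int toy_page_pop(struct toy_page *pg)		/* take one object from a page */
{
	return (pg && pg->free_objects > 0) ? pg->free_objects-- : 0;
}

static int toy_alloc(struct toy_cpu *c, struct toy_node *n)
{
	if (toy_page_pop(c->page))		/* 1. the current cpu slab */
		return 1;
	if (c->partial) {			/* 2. promote a page from the cpu partial list */
		c->page = c->partial;
		c->partial = c->partial->next;
		return toy_page_pop(c->page);
	}
	if (n->partial) {			/* 3. take a page from the node partial list */
		c->page = n->partial;
		n->partial = n->partial->next;
		return toy_page_pop(c->page);
	}
	c->page = &buddy_page;			/* 4. fall back to the "buddy system" */
	return toy_page_pop(c->page);
}

int main(void)
{
	struct toy_page node_pg = { 2, NULL };
	struct toy_cpu  cpu  = { NULL, NULL };
	struct toy_node node = { &node_pg };
	int i;

	for (i = 0; i < 4; i++)	/* two objects come from the node page, the rest from the buddy page */
		printf("allocation %d: %s\n", i, toy_alloc(&cpu, &node) ? "ok" : "failed");
	return 0;
}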

3.1: Creating a slub cache

        A slub cache is created with kmem_cache_create. This function only creates and initializes the struct kmem_cache that describes the cache; it does not yet take any memory from the buddy system. Let's first look at struct kmem_cache:

/*

 * Slab cache management.

 */

struct kmem_cache {

struct kmem_cache_cpu __percpu *cpu_slab;

/* Used for retrieving partial slabs, etc. */

slab_flags_t flags;

unsigned long min_partial;

unsigned int size;        /* The size of an object including metadata */

unsigned int object_size;/* The size of an object without metadata */

struct reciprocal_value reciprocal_size;

unsigned int offset;        /* Free pointer offset */

#ifdef CONFIG_SLUB_CPU_PARTIAL

/* Number of per cpu partial objects to keep around */

unsigned int cpu_partial;

#endif

struct kmem_cache_order_objects oo;

/* Allocation and freeing of slabs */

struct kmem_cache_order_objects max;

struct kmem_cache_order_objects min;

gfp_t allocflags;        /* gfp flags to use on each alloc */

int refcount;                /* Refcount for slab cache destroy */

void (*ctor)(void *);

unsigned int inuse;                /* Offset to metadata */

unsigned int align;                /* Alignment */

unsigned int red_left_pad;        /* Left redzone padding size */

const char *name;        /* Name (only for display!) */

struct list_head list;        /* List of slab caches */

unsigned int useroffset;        /* Usercopy region offset */

unsigned int usersize;                /* Usercopy region size */

struct kmem_cache_node *node[MAX_NUMNODES];

};
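        Before looking at the implementation, here is a typical usage sketch (my own hedged example, not from the original text): create a dedicated cache for a fixed-size object, allocate and free one object, and destroy the cache on teardown.

#include <linux/slab.h>
#include <linux/errno.h>

struct my_obj {
	int id;
	char payload[56];
};

static struct kmem_cache *my_cachep;

static int my_cache_init(void)
{
	/* the name shows up in /proc/slabinfo and under /sys/kernel/slab/ */
	my_cachep = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	return my_cachep ? 0 : -ENOMEM;
}

static void my_cache_use(void)
{
	struct my_obj *o = kmem_cache_alloc(my_cachep, GFP_KERNEL);

	if (o) {
		o->id = 42;
		kmem_cache_free(my_cachep, o);
	}
}

static void my_cache_exit(void)
{
	kmem_cache_destroy(my_cachep);
}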

kmem_cache_create is implemented as follows:

/**

 * kmem_cache_create - Create a cache.

 * @name: A string which is used in /proc/slabinfo to identify this cache.//the name shown in slabinfo

 * @size: The size of objects to be created in this cache.//size of each object in the cache

 * @align: The required alignment for the objects.//alignment requirement for each object

 * @flags: SLAB flags//the possible values and their meanings are listed below; usually not needed

 * @ctor: A constructor for the objects.//run when new pages are handed to the cache

 *

 * Cannot be called within a interrupt, but can be interrupted.

 * The @ctor is run when new pages are allocated by the cache.

 *

 * The flags are

 *

 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)

 * to catch references to uninitialised memory.

 *

 * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check

 * for buffer overruns.

 *

 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware

 * cacheline.  This can be beneficial if you're counting cycles as closely

 * as davem.

 *

 * Return: a pointer to the cache on success, NULL on failure.

 */

struct kmem_cache *

kmem_cache_create(const char *name, unsigned int size, unsigned int align,

slab_flags_t flags, void (*ctor)(void *))

{

return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,

  ctor);

}

struct kmem_cache *

kmem_cache_create_usercopy(const char *name,

  unsigned int size, unsigned int align,

  slab_flags_t flags,

  unsigned int useroffset, unsigned int usersize,

  void (*ctor)(void *))

{

struct kmem_cache *s = NULL;

const char *cache_name;

int err;

get_online_cpus();

get_online_mems();

mutex_lock(&slab_mutex);

/* Refuse requests with allocator specific flags */

if (flags & ~SLAB_FLAGS_PERMITTED) {

err = -EINVAL;

goto out_unlock;

}

/*

 * Some allocators will constraint the set of valid flags to a subset

 * of all flags. We expect them to define CACHE_CREATE_MASK in this

 * case, and we'll just provide them with a sanitized version of the

 * passed flags.

 */

flags &= CACHE_CREATE_MASK;

if (!usersize)

s = __kmem_cache_alias(name, size, align, flags, ctor);      //via find_mergeable, look for an existing cache that the new cache can be merged with

if (s)

goto out_unlock;

cache_name = kstrdup_const(name, GFP_KERNEL);         //duplicate the name (kstrdup_const avoids the copy when the string lives in .rodata)

if (!cache_name) {

err = -ENOMEM;

goto out_unlock;

}

//no mergeable cache was found, so create a new one

s = create_cache(cache_name, size,

 calculate_alignment(flags, align, size),

 flags, useroffset, usersize, ctor, NULL);

if (IS_ERR(s)) {

err = PTR_ERR(s);

kfree_const(cache_name);

}

out_unlock:

mutex_unlock(&slab_mutex);

put_online_mems();

put_online_cpus();

return s;

}

static struct kmem_cache *create_cache(const char *name,

unsigned int object_size, unsigned int align,

slab_flags_t flags, unsigned int useroffset,

unsigned int usersize, void (*ctor)(void *),

struct kmem_cache *root_cache)

{

struct kmem_cache *s;

int err;

s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);        //allocate the struct kmem_cache itself from the kmem_cache cache

if (!s)

goto out;

s->name = name;

s->size = s->object_size = object_size;    //object_size is the size of each object in the cache

s->align = align;

s->ctor = ctor;

s->useroffset = useroffset;

s->usersize = usersize;

//the values set above can be inspected under /sys/kernel/slab/

err = __kmem_cache_create(s, flags);

if (err)

goto out_free_cache;

s->refcount = 1;

list_add(&s->list, &slab_caches);           //slab_caches is the list of all slab caches

out:

if (err)

return ERR_PTR(err);

return s;

}

3.1.1:__kmem_cache_create

        __kmem_cache_create has different implementations for slab, slub and slob. Slub is the common choice today, so we look at its implementation in slub.c.

int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)

{

int err;

err = kmem_cache_open(s, flags);

if (err)

return err;

/* Mutex is not taken during early boot */

if (slab_state <= UP)

return 0;

err = sysfs_slab_add(s);           //the cache is now initialized; add its entries under /sys/kernel/slab so its state can be inspected there

if (err)

__kmem_cache_release(s);

return err;

}

This function calls kmem_cache_open, which is implemented as follows:

static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)

{

s->flags = kmem_cache_flags(s->size, flags, s->name);

if (!calculate_sizes(s, -1))       //computes the slab order, i.e. how many pages one slab needs and how many objects it holds

goto error;

/*

 * The larger the object size is, the more pages we want on the partial

 * list to avoid pounding the page allocator excessively.

 */

set_min_partial(s, ilog2(s->size) / 2);    //sets kmem_cache.min_partial, visible in /sys/kernel/slab/<name>/min_partial. When the number of pages on a node's partial list exceeds min_partial, empty slabs (1<<order pages each) are returned to the buddy system

set_cpu_partial(s);          //sets struct kmem_cache->cpu_partial. As the diagram in section 3 shows, when the pages on a kmem_cache_cpu's partial list hold more free objects than cpu_partial, those pages are moved to the node's partial list

if (!init_kmem_cache_nodes(s))    //allocate and initialize the kmem_cache_node structures

goto error;

if (alloc_kmem_cache_cpus(s))

/*

struct kmem_cache {

struct kmem_cache_cpu __percpu *cpu_slab;

……}

struct kmem_cache_cpu is a per-cpu variable, i.e. every CPU has its own copy. Its contents:

struct kmem_cache_cpu {

void **freelist;        /* Pointer to next available object */

unsigned long tid;        /* Globally unique transaction id */

struct page *page;        /* The slab from which we are allocating */

};

alloc_kmem_cache_cpus allocates this structure; its fields are filled in later, when slabs are attached.

*/

return 0;

error:

__kmem_cache_release(s);

return -EINVAL;

}

3.1.2:calculate_sizes

        calculate_sizes determines a slab's order and the layout of data within it. The order determines how much memory each slab occupies (1<<order pages), so one slab may span several contiguous physical pages.
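        The relation between object size, order and object count can be illustrated with a simplified userspace model (my own sketch, assuming 4 KiB pages; the kernel's calculate_order() additionally trades off wasted space and fragmentation):

#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

static unsigned long objects_per_slab(unsigned int order, unsigned long size)
{
	return (TOY_PAGE_SIZE << order) / size;	/* bytes in the slab divided by the aligned object size */
}

int main(void)
{
	/* 96-byte objects: an order-0 slab holds 42 of them, an order-1 slab holds 85 */
	printf("order 0: %lu objects\n", objects_per_slab(0, 96));
	printf("order 1: %lu objects\n", objects_per_slab(1, 96));
	return 0;
}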

/*

 * calculate_sizes() determines the order and the distribution of data within

 * a slab object.

 */

static int calculate_sizes(struct kmem_cache *s, int forced_order)

{

slab_flags_t flags = s->flags;

unsigned int size = s->object_size;

unsigned int order;

/*

 * Round up object size to the next word boundary. We can only

 * place the free pointer at word boundaries and this determines

 * the possible location of the free pointer.

 */

size = ALIGN(size, sizeof(void *));

/*

 * With that we have determined the number of bytes in actual use

 * by the object and redzoning.

 */

s->inuse = size;

if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||

    ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||

    s->ctor) {

……

} else {

/*

 * Store freelist pointer near middle of object to keep

 * it away from the edges of the object to avoid small

 * sized over/underflows from neighboring allocations.

 */

s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));       //offset is where the free pointer lives. A slab holds many objects, and a free object is reused to store the address of the next free object, forming a linked list. offset is the offset, from the start of a free object, of the slot that holds the next free object's address

}

/*

 * SLUB stores one object immediately after another beginning from

 * offset 0. In order to align the objects we have to simply size

 * each object to conform to the alignment.

 */

size = ALIGN(size, s->align);

s->size = size;

s->reciprocal_size = reciprocal_value(size);

if (forced_order >= 0)

order = forced_order;

else

order = calculate_order(size);              //from the object size, compute how many pages one slab uses. When new_slab_objects later attaches a new slab to a struct kmem_cache_cpu, 1<<order pages are taken from the buddy system

if ((int)order < 0)

return 0;

s->allocflags = 0;

if (order)

s->allocflags |= __GFP_COMP;

if (s->flags & SLAB_CACHE_DMA)

s->allocflags |= GFP_DMA;

if (s->flags & SLAB_CACHE_DMA32)

s->allocflags |= GFP_DMA32;

if (s->flags & SLAB_RECLAIM_ACCOUNT)

s->allocflags |= __GFP_RECLAIMABLE;

/*

 * Determine the number of objects per slab

 */

s->oo = oo_make(order, size);             //oo packs the slab order together with the number of objects one slab can hold

s->min = oo_make(get_order(size), size);

if (oo_objects(s->oo) > oo_objects(s->max))

s->max = s->oo;

return !!oo_objects(s->oo);

}

3.2: Giving a slub cache memory

        The cache created above does not own any memory yet. When it is first used, pages are allocated for it, and objects are then carved out of those pages.

        The call chain for that is kmalloc -> __kmalloc -> slab_alloc -> slab_alloc_node -> __slab_alloc -> ___slab_alloc -> new_slab_objects. In other words, new_slab_objects is called whenever the cache needs fresh pages from the buddy system.

static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,

int node, struct kmem_cache_cpu **pc)

{

void *freelist;

struct kmem_cache_cpu *c = *pc;

struct page *page;

WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

freelist = get_partial(s, flags, node, c);       

if (freelist)

return freelist;

//get_partial is discussed in section 3.3 (kmalloc allocation); that path does not reach the buddy system

page = new_slab(s, flags, node);      //allocate pages from the buddy system for a new slab

if (page) {

c = raw_cpu_ptr(s->cpu_slab);

if (c->page)

flush_slab(s, c);

/*

 * No other reference to the page yet so we can

 * muck around with it freely without cmpxchg

 */

freelist = page->freelist;        //freelist points to the start of the page's direct-mapped virtual address range, which is also the first free object

page->freelist = NULL;

stat(s, ALLOC_SLAB);

c->page = page;       //page and freelist belong together: freelist is the virtual start address of the page described by page

*pc = c;

}

return freelist;

}

static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)

{

return allocate_slab(s,

flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);

}

static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)

{

struct page *page;

struct kmem_cache_order_objects oo = s->oo;

gfp_t alloc_gfp;

void *start, *p, *next;

int idx;

bool shuffle;

flags &= gfp_allowed_mask;

if (gfpflags_allow_blocking(flags))

local_irq_enable();

flags |= s->allocflags;

/*

 * Let the initial higher-order allocation fail under memory pressure

 * so we fall-back to the minimum order allocation.

 */

alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;

if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))

alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);

page = alloc_slab_page(s, alloc_gfp, node, oo);        //allocate 1<<oo_order(oo) pages from the buddy system

page->objects = oo_objects(oo);         //a slab may span several pages; the total number of objects it holds goes into page->objects

page->slab_cache = s;

__SetPageSlab(page);

if (page_is_pfmemalloc(page))

SetPageSlabPfmemalloc(page);

start = page_address(page);          //get the page's virtual address. These are direct-mapped addresses (linear relation between physical and virtual); on x86-64 the direct map starts at 0xffff888000000000

if (1) {

start = fixup_red_left(s, start);

start = setup_object(s, page, start);

page->freelist = start;    //so page->freelist is the virtual start address of the slab

for (idx = 0, p = start; idx < page->objects - 1; idx++) {

next = p + s->size;

next = setup_object(s, page, next);

set_freepointer(s, p, next);      //*(p + s->offset) = next: store the address of the next free object inside the current free object, at offset s->offset

p = next;

}

set_freepointer(s, p, NULL);      //for the last object, *(p + s->offset) = NULL

}

page->inuse = page->objects;

page->frozen = 1;

out:

if (gfpflags_allow_blocking(flags))

local_irq_disable();

if (!page)

return NULL;

inc_slabs_node(s, page_to_nid(page), page->objects);

return page;

}

        This shows how slub tracks and hands out free objects:

        1: kmem_cache -> kmem_cache_cpu -> freelist is the address of the first free object in the current slab.

        2: the word at kmem_cache_cpu->freelist + kmem_cache->offset holds the address of the next free object.

        3: an allocation hands out the object at freelist and advances freelist to the next free object, i.e. the pointer stored at freelist + offset.

        4: a free makes freelist point at the released object and stores the old freelist value at that object + offset. A simplified simulation of this discipline follows.
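        The following self-contained userspace simulation (an illustrative sketch of my own, not kernel code) reproduces points 1-4: a free object stores the address of the next free object at a fixed byte offset inside itself, and alloc/free are a pop/push on that implicit list.

#include <stdio.h>
#include <string.h>

#define OBJ_SIZE  32
#define OBJ_COUNT 4
#define OFFSET    16	/* where the free pointer lives inside a free object */

static unsigned char slab_mem[OBJ_COUNT * OBJ_SIZE];
static void *freelist;

static void toy_slab_init(void)
{
	int i;

	freelist = slab_mem;
	for (i = 0; i < OBJ_COUNT; i++) {
		void *obj  = slab_mem + i * OBJ_SIZE;
		void *next = (i + 1 < OBJ_COUNT) ? slab_mem + (i + 1) * OBJ_SIZE : NULL;

		memcpy((char *)obj + OFFSET, &next, sizeof(next));	/* like set_freepointer() */
	}
}

static void *toy_obj_alloc(void)
{
	void *obj = freelist;

	if (obj)	/* advance freelist to the pointer stored at obj + OFFSET */
		memcpy(&freelist, (char *)obj + OFFSET, sizeof(freelist));
	return obj;
}

static void toy_obj_free(void *obj)
{
	memcpy((char *)obj + OFFSET, &freelist, sizeof(freelist));	/* old head goes into the object */
	freelist = obj;							/* freed object becomes the new head */
}

int main(void)
{
	void *a, *b;

	toy_slab_init();
	a = toy_obj_alloc();
	b = toy_obj_alloc();
	printf("a=%p b=%p\n", a, b);
	toy_obj_free(a);				/* a is now the head of the free list again */
	printf("next alloc=%p (same as a)\n", toy_obj_alloc());
	return 0;
}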

3.3: Allocating memory with kmalloc

        This section describes how slub serves an allocation. The entry points are kmalloc and kmem_cache_alloc, with the call chains:

kmalloc -> __kmalloc -> slab_alloc

kmem_cache_alloc -> slab_alloc

        slab_alloc is implemented as follows:

static __always_inline void *slab_alloc(struct kmem_cache *s,

gfp_t gfpflags, unsigned long addr)

{

return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);

}

        When a kmem_cache is created, every CPU gets its own per-cpu slab state, kept in the per-cpu variable struct kmem_cache_cpu; this avoids the locking overhead of concurrent access from multiple CPUs. As a reminder, struct kmem_cache_cpu is defined as follows:

struct kmem_cache {

struct kmem_cache_cpu __percpu *cpu_slab;

……

};

struct kmem_cache_cpu {

void **freelist;        /* Pointer to next available object */

unsigned long tid;        /* Globally unique transaction id */

struct page *page;        /* The slab from which we are allocating */

};

        So we must give the cache memory and, from that memory, fill in the members of struct kmem_cache_cpu.

3.3.1:slab_alloc_node

        slab_alloc_node is implemented as follows:

//as in the buddy allocator, there is a fast path and a slow path
static __always_inline void *slab_alloc_node(struct kmem_cache *s,

gfp_t gfpflags, int node, unsigned long addr)

{

void *object;

struct kmem_cache_cpu *c;

struct page *page;

unsigned long tid;

struct obj_cgroup *objcg = NULL;

s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);

if (!s)

return NULL;

redo:

/*

 * Must read kmem_cache cpu data via this cpu ptr. Preemption is

 * enabled. We may switch back and forth between cpus while

 * reading from one cpu area. That does not matter as long

 * as we end up on the original cpu again when doing the cmpxchg.

 *

 * We should guarantee that tid and kmem_cache are retrieved on

 * the same cpu. It could be different if CONFIG_PREEMPTION so we need

 * to check if it is matched or not.

 */

do {

tid = this_cpu_read(s->cpu_slab->tid);

c = raw_cpu_ptr(s->cpu_slab);

} while (IS_ENABLED(CONFIG_PREEMPTION) &&

 unlikely(tid != READ_ONCE(c->tid)));

//as mentioned, struct kmem_cache_cpu holds the current CPU's slab and free-object list

/*

 * Irqless object alloc/free algorithm used here depends on sequence

 * of fetching cpu_slab's data. tid should be fetched before anything

 * on c to guarantee that object and page associated with previous tid

 * won't be used with current tid. If we fetch tid first, object and

 * page could be one associated with next tid and our alloc/free

 * request will be failed. In this case, we will retry. So, no problem.

 */

barrier();

/*

 * The transaction ids are globally unique per cpu and per operation on

 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double

 * occurs on the right processor and that there was no operation on the

 * linked list in between.

 */

object = c->freelist;         //freelist is the address of the first free object in this CPU's slab

page = c->page;

if (unlikely(!object || !page || !node_match(page, node))) {

object = __slab_alloc(s, gfpflags, node, addr, c);      //the slow path

} else {         //the fast path: freelist itself is the object to hand out

void *next_object = get_freepointer_safe(s, object);   //this essentially expands to

(void *)*(unsigned long *)(object + s->offset). As noted above, a free object stores the address of the next free object at offset s->offset, so this yields the next free object's address

/*

 * The cmpxchg will only match if there was no additional

 * operation and if we are on the right processor.

 *

 * The cmpxchg does the following atomically (without lock

 * semantics!)

 * 1. Relocate first pointer to the current per cpu area.

 * 2. Verify that tid and freelist have not been changed

 * 3. If they were not changed replace tid and freelist

 *

 * Since this is without lock semantics the protection is only

 * against code executing on this cpu *not* from access by

 * other cpus.

 *///this_cpu_cmpxchg_double compares arguments 1 and 2 with arguments 3 and 4; if they match, it stores arguments 5 and 6 into arguments 1 and 2. In effect, next_object becomes s->cpu_slab->freelist

if (unlikely(!this_cpu_cmpxchg_double(

s->cpu_slab->freelist, s->cpu_slab->tid,

object, tid,

next_object, next_tid(tid)))) {

note_cmpxchg_failure("slab_alloc", s, tid);

goto redo;

}

prefetch_freepointer(s, next_object);

stat(s, ALLOC_FASTPATH);

}

maybe_wipe_obj_freeptr(s, object);

if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)

memset(object, 0, s->object_size);

slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);

return object;

}

        So on the fast path we simply take kmem_cache->cpu_slab->freelist and return it as the allocated object.
 

3.3.2:___slab_alloc

        Besides the fast path there is a slow path. The very first allocation from a cache also takes the slow path, because at that point the kmem_cache_cpu has no free objects to hand out.

static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,

  unsigned long addr, struct kmem_cache_cpu *c)

{

void *p;

unsigned long flags;

local_irq_save(flags);

#ifdef CONFIG_PREEMPTION

/*

 * We may have been preempted and rescheduled on a different

 * cpu before disabling interrupts. Need to reload cpu area

 * pointer.

 *///re-fetch the struct kmem_cache_cpu

c = this_cpu_ptr(s->cpu_slab);

#endif

p = ___slab_alloc(s, gfpflags, node, addr, c);

local_irq_restore(flags);

return p;

}

static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,

  unsigned long addr, struct kmem_cache_cpu *c)

{

void *freelist;

struct page *page;

stat(s, ALLOC_SLOWPATH);

page = c->page;       //c->page is the slab the cache is currently allocating from; c->freelist belongs to this page

if (!page) {         //right after the cache is created it has no memory yet, so page is NULL

/*

 * if the node is not online or has no normal memory, just

 * ignore the node constraint

 */

if (unlikely(node != NUMA_NO_NODE &&

     !node_state(node, N_NORMAL_MEMORY)))

node = NUMA_NO_NODE;

goto new_slab;

}

redo:

/* must check again c->freelist in case of cpu migration or IRQ *///if freelist is non-NULL now, free objects appeared while we were entering the slow path; per the comment, this can be caused by an interrupt or by process migration

freelist = c->freelist;

if (freelist)

goto load_freelist;

freelist = get_freelist(s, page);         //page here is struct kmem_cache_cpu.page. Because the new_slab label jumps back to redo, it may be the page the kmem_cache_cpu already had on entry, or one taken from kmem_cache_cpu.partial. The key effect of this function is to return page->freelist and set page->freelist = NULL

if (!freelist) {                //in this case struct kmem_cache_cpu.page has no free objects either

c->page = NULL;  

stat(s, DEACTIVATE_BYPASS);

goto new_slab;       //get a new page

}

stat(s, ALLOC_REFILL);

load_freelist:

/*

 * freelist is pointing to the list of objects to be used.

 * page is pointing to the page from which the objects are obtained.

 * That page must be frozen for per cpu allocations to work.

 */

VM_BUG_ON(!c->page->frozen);

c->freelist = get_freepointer(s, freelist);       //return the first free object and point c->freelist at the second one

c->tid = next_tid(c->tid);

return freelist;

new_slab:      //attach a new slab (1<<order pages) to the kmem_cache_cpu

if (slub_percpu_partial(c)) {          //if kmem_cache_cpu.partial has pages, allocate from them; partial pages are partially allocated slabs

page = c->page = slub_percpu_partial(c);      //set c->page to the first page on c->partial

slub_set_percpu_partial(c, page);                  //set c->partial to the next page on the old partial list

stat(s, CPU_PARTIAL_ALLOC);

//i.e. promote the first partial page to struct kmem_cache_cpu.page and advance struct kmem_cache_cpu.partial to the next page

goto redo;

}

//reaching here means the kmem_cache_cpu.partial pages yielded no free object, so a new slab must be obtained

freelist = new_slab_objects(s, gfpflags, node, &c);    //this may allocate new pages for the slab from the buddy system; the returned freelist points to a free object

if (unlikely(!freelist)) {

slab_out_of_memory(s, gfpflags, node);

return NULL;

}

page = c->page;

if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))

goto load_freelist;               //after the new page is attached, jump back to load_freelist, which sets c->freelist = get_freepointer(s, freelist), i.e. the address of the second free object

/* Only entered in the debug case */

if (kmem_cache_debug(s) &&

!alloc_debug_processing(s, page, freelist, addr))

goto new_slab;        /* Slab failed checks. Next slab needed */

deactivate_slab(s, page, get_freepointer(s, freelist), c);

return freelist;

}

        Besides the per-CPU partial list hanging off each struct kmem_cache_cpu, there is also a per-node partial list. ___slab_alloc above searched the per-CPU partial list; new_slab_objects -> get_partial then searches the node's partial list.

static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,

int node, struct kmem_cache_cpu **pc)

{

void *freelist;

struct kmem_cache_cpu *c = *pc;

struct page *page;

WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

freelist = get_partial(s, flags, node, c);     //look for free objects on the kmem_cache_node partial lists

if (freelist)

return freelist;

page = new_slab(s, flags, node);     //still nothing: neither kmem_cache_cpu nor kmem_cache_node has free objects, so allocate from the buddy system. This function was described in section 3.2

if (page) {

c = raw_cpu_ptr(s->cpu_slab);

if (c->page)

flush_slab(s, c);

/*

 * No other reference to the page yet so we can

 * muck around with it freely without cmpxchg

 */

freelist = page->freelist;

page->freelist = NULL;

stat(s, ALLOC_SLAB);

c->page = page;

*pc = c;

}

return freelist;

}

static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,

struct kmem_cache_cpu *c)

{

void *object;

int searchnode = node;

if (node == NUMA_NO_NODE)

searchnode = numa_mem_id();

object = get_partial_node(s, get_node(s, searchnode), c, flags);     //search this kmem_cache's partial list for the current node for free objects; return the first one found

if (object || node != NUMA_NO_NODE)

return object;

return get_any_partial(s, flags, c);   //look for partial pages on other NUMA nodes; on UMA this simply returns NULL

}

static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,

struct kmem_cache_cpu *c, gfp_t flags)

{

struct page *page, *page2;

void *object = NULL;

unsigned int available = 0;

int objects;

/*

 * Racy check. If we mistakenly see no partial slabs then we

 * just allocate an empty slab. If we mistakenly try to get a

 * partial slab and there is none available then get_partial()

 * will return NULL.

 *///struct kmem_cache_node->partical是一个空链表的时候,我们直接返回NULL

if (!n || !n->nr_partial)

return NULL;

spin_lock(&n->list_lock);

list_for_each_entry_safe(page, page2, &n->partial, slab_list) {          //walk the kmem_cache_node's partial list; each entry is a page

void *t;

t = acquire_slab(s, n, page, object == NULL, &objects);   //store the page's number of free objects into objects (new.objects - new.inuse) and remove the page from the node's partial list

if (!t)

break;

available += objects;      //available is the total number of free objects moved from the kmem_cache_node to the kmem_cache_cpu

if (!object) {

c->page = page;         //the first page becomes struct kmem_cache_cpu.page, the slab future allocations use; it then satisfies: 1: page.freelist = NULL; 2: page.inuse = page.objects; 3: page.frozen = 1

stat(s, ALLOC_FROM_PARTIAL);

object = t;

} else {

put_cpu_partial(s, page, 0);         //subsequent pages are simply added to kmem_cache_cpu.partial, so several partial pages migrate from the kmem_cache_node to the kmem_cache_cpu

stat(s, CPU_PARTIAL_NODE);

}

if (!kmem_cache_has_cpu_partial(s)

|| available > slub_cpu_partial(s) / 2)       //once the number of free objects moved from the kmem_cache_node exceeds kmem_cache.cpu_partial/2, stop moving pages. Again, cpu_partial can be read from /sys/kernel/slab/<name>/cpu_partial

break;

}

spin_unlock(&n->list_lock);

return object;

}

        To summarize, the allocation falls back through three steps: allocate directly from c->freelist; or promote a page from c->partial to c->page and then allocate from c->freelist; or refill c->page and c->partial from node->partial and then allocate from c->freelist.

3.4: Freeing memory with kfree

        kfree is implemented as follows:

void kfree(const void *x)

{

struct page *page;

void *object = (void *)x;

page = virt_to_head_page(x);

slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);    //the arguments are the kmem_cache, the page, and the address of the object being freed

}

static __always_inline void slab_free(struct kmem_cache *s, struct page *page,

      void *head, void *tail, int cnt,

      unsigned long addr)

{

if (slab_free_freelist_hook(s, &head, &tail, &cnt))         //this hook does not affect the normal flow

do_slab_free(s, page, head, tail, cnt, addr);

}

        do_slab_free is implemented as follows; its arguments are effectively the same as those of slab_free.

static __always_inline void do_slab_free(struct kmem_cache *s,

struct page *page, void *head, void *tail,

int cnt, unsigned long addr)

{

void *tail_obj = tail ? : head;

struct kmem_cache_cpu *c;

unsigned long tid;

redo:

/*

 * Determine the currently cpus per cpu slab.

 * The cpu may change afterward. However that does not matter since

 * data is retrieved via this pointer. If we are on the same cpu

 * during the cmpxchg then the free will succeed.

 */

do {

tid = this_cpu_read(s->cpu_slab->tid);

c = raw_cpu_ptr(s->cpu_slab);

} while (IS_ENABLED(CONFIG_PREEMPTION) &&

 unlikely(tid != READ_ONCE(c->tid)));

/* Same with comment on barrier() in slab_alloc_node() */

barrier();

if (likely(page == c->page)) {      //from the allocation section we know struct kmem_cache_cpu.page and struct kmem_cache_cpu.freelist belong together. If the page of the object being freed is the current c->page, the object can simply be pushed onto the current freelist: this is the fast free path

void **freelist = READ_ONCE(c->freelist);

set_freepointer(s, tail_obj, freelist);   //store the old freelist inside the freed object, i.e. the freed object becomes the new head of the list

if (unlikely(!this_cpu_cmpxchg_double(    //atomically set s->cpu_slab->freelist to head

s->cpu_slab->freelist, s->cpu_slab->tid,

freelist, tid,

head, next_tid(tid)))) {

note_cmpxchg_failure("slab_free", s, tid);

goto redo;

}

stat(s, FREE_FASTPATH);

} else

__slab_free(s, page, head, tail_obj, cnt, addr);

}

3.4.1: The slow free path, __slab_free

        On the slow allocation path we took objects from struct kmem_cache_cpu.partial, then from struct kmem_cache_node.partial, and finally from pages newly allocated from the buddy system. Correspondingly, a free may have to go back to a page that now sits on kmem_cache_cpu.partial or kmem_cache_node.partial; in other words, the object being freed does not belong to the current kmem_cache_cpu.page. This is easy to picture, for example when freeing an object that was allocated long ago. In that case the object cannot be pushed onto kmem_cache_cpu.freelist, because that freelist always corresponds to the current page; it has to go onto the freelist of its own page instead.

static void __slab_free(struct kmem_cache *s, struct page *page,

void *head, void *tail, int cnt,

unsigned long addr)

//here head == tail, the address of the object being freed

{

void *prior;

int was_frozen;

struct page new;

unsigned long counters;

struct kmem_cache_node *n = NULL;

unsigned long flags;

stat(s, FREE_SLOWPATH);

do {

if (unlikely(n)) {

spin_unlock_irqrestore(&n->list_lock, flags);

n = NULL;

}

prior = page->freelist;       //the page's previous first free object

counters = page->counters;

set_freepointer(s, tail, prior);          //tail and head are the same pointer, the object being freed; put it at the front of the page's freelist

new.counters = counters;

was_frozen = new.frozen;     //pages owned by a kmem_cache_cpu are frozen

new.inuse -= cnt;

if ((!new.inuse || !prior) && !was_frozen) {   //inuse == 0 means every object in the page is now free; prior == NULL means the page previously had every object allocated (a fully allocated slab)

if (kmem_cache_has_cpu_partial(s) && !prior) {   //the page was previously fully allocated; such pages sit on neither the kmem_cache_cpu nor the kmem_cache_node partial list, they are kept off-list

/*

 * Slab was on no list before and will be

 * partially empty

 * We can defer the list move and instead

 * freeze it.

 */

new.frozen = 1;     //freeze the page; for now it is not moved onto any partial list

} else { /* Needs to be taken off a list *///every object in the page is now free and the page was not frozen, i.e. it sits on the node's (struct kmem_cache->node[node]) partial list

n = get_node(s, page_to_nid(page));

/*

 * Speculatively acquire the list_lock.

 * If the cmpxchg does not succeed then we may

 * drop the list_lock without any processing.

 *

 * Otherwise the list_lock will synchronize with

 * other processors updating the list of slabs.

 */

spin_lock_irqsave(&n->list_lock, flags);

}

}

} while (!cmpxchg_double_slab(s, page,       //通过cmpxchg把page->freelist更新为head(这次释放的对象),同时更新counters

prior, counters,

head, new.counters,

"__slab_free"));

if (likely(!n)) {        //没有拿node锁(n==NULL)的情况走到这里,说明不需要操作kmem_cache_node的链表

if (likely(was_frozen)) {

/*

 * The list lock was not taken therefore no list

 * activity can be necessary.

 *///page之前就是frozen的,属于某个CPU(当前使用的page或cpu partial链表上的page),这里只需更改freelist,然后返回

stat(s, FREE_FROZEN);         

} else if (new.frozen) {     

/*

 * If we just froze the page then put it onto the

 * per cpu partial list.

 */

put_cpu_partial(s, page, 1);        //之前page是全满的,全满的page不在任何链表上,会走到这里。在上面更新page的freelist并将其frozen之后,这里把它添加到cpu partial链表

stat(s, CPU_PARTIAL_FREE);

}

return;       

/*

走到这个return的几种情况:

1:之前page在node的partial链表上,这次释放后page中仍有对象在用,上面只更改了page的freelist,直接返回

2:之前page是frozen的(属于某个CPU的当前page或cpu partial链表),这里也只是更改freelist后返回

3:之前page是全满的,上面刚把它frozen并挂到了cpu partial链表,然后走到这里返回

*/

}

//只有在需要操作kmem_cache_node链表(n != NULL)时才会走到这里:典型情况是这次释放后page中所有对象都空闲,且page还挂在node的partial链表上

if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))    //之前page在node的partial链表上,这次释放后page全部空闲,并且node的nr_partial已经不少于min_partial,会走到这里,把page释放回伙伴系统

goto slab_empty;

……

spin_unlock_irqrestore(&n->list_lock, flags);

return;                        //之前page在node的partial链表上,这次释放后page全部空闲,但node的nr_partial还不多,会走到这里,只更改freelist后返回,page留在partial链表上

slab_empty:

if (prior) {

/*

 * Slab on the partial list.

 */

remove_partial(n, page);

stat(s, FREE_REMOVE_PARTIAL);

} else {

/* Slab must be on the full list */

remove_full(s, n, page);               //在没有开启SLUB调试功能时,这个函数什么也不做

}

spin_unlock_irqrestore(&n->list_lock, flags);

stat(s, FREE_SLAB);

discard_slab(s, page);

}
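
        上面__slab_free中的分支比较多,下面用一个用户态的小函数概括page在释放后的几种去向。这是一个虚构的简化模型,假设开启了cpu partial功能,并省略了加锁和cmpxchg重试等细节。

/* 用户态示意:概括__slab_free中page的几种去向(虚构的简化模型)。 */
#include <stdbool.h>
#include <stdio.h>

enum dest { KEEP_AS_IS, PUT_CPU_PARTIAL, STAY_NODE_PARTIAL, BACK_TO_BUDDY };

static enum dest classify_free(bool was_frozen, bool was_full,
                               unsigned inuse_after_free,
                               unsigned nr_partial, unsigned min_partial)
{
    if (was_frozen)
        return KEEP_AS_IS;          /* page属于某个CPU,只改freelist */
    if (was_full)
        return PUT_CPU_PARTIAL;     /* 之前全满:frozen之后挂到cpu partial链表 */
    if (inuse_after_free == 0 && nr_partial >= min_partial)
        return BACK_TO_BUDDY;       /* 释放后全部空闲且node的partial已足够多:discard_slab */
    return STAY_NODE_PARTIAL;       /* 其余情况留在node的partial链表上 */
}

int main(void)
{
    printf("%d\n", classify_free(false, true, 3, 2, 5));   /* 输出1:PUT_CPU_PARTIAL */
    printf("%d\n", classify_free(false, false, 0, 10, 5)); /* 输出3:BACK_TO_BUDDY */
    return 0;
}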

4:vmalloc

        我们一般使用vmalloc来分配大块内存。它会首先在内核中创建一个struct vm_struct,用来描述这段虚拟地址区间,然后通过伙伴系统分配物理页面,并建立虚拟地址和物理地址之间的映射。vmalloc的实现如下

void *vmalloc(unsigned long size)

{

return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,           //GFP_KERNEL表示在分配过程中可能发生内存回收,甚至会阻塞当前进程做直接回收。因此,vmalloc不能用在原子上下文中

__builtin_return_address(0));

}

void *__vmalloc_node(unsigned long size, unsigned long align,

    gfp_t gfp_mask, int node, const void *caller)

{

return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,

gfp_mask, PAGE_KERNEL, 0, node, caller);

//VMALLOC_START, VMALLOC_END是内核中,用于vmalloc的虚拟地址空间的起始和结束地址。我们可以通过linux/Documentation/arm64/memory.rst查看这两个值

}

void *__vmalloc_node_range(unsigned long size, unsigned long align,

unsigned long start, unsigned long end, gfp_t gfp_mask,

pgprot_t prot, unsigned long vm_flags, int node,

const void *caller)

{

struct vm_struct *area;

void *addr;

unsigned long real_size = size;

size = PAGE_ALIGN(size);

if (!size || (size >> PAGE_SHIFT) > totalram_pages())

goto fail;

area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |       //分配vm_struct,用于存储虚拟地址空间结构体

vm_flags, start, end, node, gfp_mask, caller);

if (!area)

goto fail;

addr = __vmalloc_area_node(area, gfp_mask, prot, node);        //分配物理页面,并和vm_struct做映射

if (!addr)

return NULL;

/*

 * In this function, newly allocated vm_struct has VM_UNINITIALIZED

 * flag. It means that vm_struct is not fully initialized.

 * Now, it is fully initialized, so remove this flag here.

 */

clear_vm_uninitialized_flag(area);

return addr;

}

4.1:__get_vm_area_node——分配vm_struct,vmap_area

        函数__get_vm_area_node的作用是分配struct vm_struct和struct vmap_area。内核中可能同时存在许多vm_struct和vmap_area,它们之间是一一对应的关系。为了能够更快地找到某个地址对应的数据结构,我们会将struct vmap_area组织成一棵红黑树。函数实现如下:

static struct vm_struct *__get_vm_area_node(unsigned long size,

unsigned long align, unsigned long flags, unsigned long start,

unsigned long end, int node, gfp_t gfp_mask, const void *caller)

{

struct vmap_area *va;

struct vm_struct *area;

unsigned long requested_size = size;

BUG_ON(in_interrupt());

size = PAGE_ALIGN(size);

if (flags & VM_IOREMAP)

align = 1ul << clamp_t(int, get_count_order_long(size),

       PAGE_SHIFT, IOREMAP_MAX_ORDER);

area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);        //分配struct vm_struct占用的内存

if (!(flags & VM_NO_GUARD))        //如果没有设置VM_NO_GUARD,我们会给size多加一个page,作为保护页

size += PAGE_SIZE;

va = alloc_vmap_area(size, align, start, end, node, gfp_mask);        //分配一个struct vmap_area结构体并初始化

if (IS_ERR(va)) {

kfree(area);

return NULL;

}

kasan_unpoison_vmalloc((void *)va->va_start, requested_size);

setup_vmalloc_vm(area, va, flags, caller);    //把vm_struct挂到vmap_area上(va->vm = area),并填充area的addr、size、flags、caller等字段

return area;

}

        函数alloc_vmap_area的实现如下所示

static struct vmap_area *alloc_vmap_area(unsigned long size,

unsigned long align,

unsigned long vstart, unsigned long vend,

int node, gfp_t gfp_mask)

{

struct vmap_area *va, *pva;

unsigned long addr;

int purged = 0;

int ret;

gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);    //从kmem_cache(vmap_area_cachep)中分配一个struct vmap_area结构体

if (unlikely(!va))

return ERR_PTR(-ENOMEM);

retry:

……

/*

 * If an allocation fails, the "vend" address is

 * returned. Therefore trigger the overflow path.

 */

addr = __alloc_vmap_area(size, align, vstart, vend);          //这里通过红黑树free_vmap_area_root找到了一个合适的vmap_area,也就是一个满足需求的线性地址空间。这里返回线性地址空间的首地址

spin_unlock(&free_vmap_area_lock);

va->va_start = addr;

va->va_end = addr + size;

va->vm = NULL;

spin_lock(&vmap_area_lock);

insert_vmap_area(va, &vmap_area_root, &vmap_area_list);      //与空闲的vmap_area构成free_vmap_area_root红黑树类似,使用中的vmap_area也会构成vmap_area_root红黑树,并挂到vmap_area_list链表上

spin_unlock(&vmap_area_lock);

return va;

}

static __always_inline unsigned long

__alloc_vmap_area(unsigned long size, unsigned long align,

unsigned long vstart, unsigned long vend)

{

unsigned long nva_start_addr;

struct vmap_area *va;

enum fit_type type;

int ret;

va = find_vmap_lowest_match(size, align, vstart);       //这个函数在红黑树free_vmap_area_root中,找到满足size和vstart要求的空闲虚拟地址空间。free_vmap_area_root是由空闲的struct vmap_area构成的红黑树,vmap_area的起始地址越小,它在红黑树中的位置越靠左

if (unlikely(!va))

return vend;

if (va->va_start > vstart)

nva_start_addr = ALIGN(va->va_start, align);

else

nva_start_addr = ALIGN(vstart, align);

//上面的nva_start_addr就是我们这次要分配出去的struct vmap_area的起始地址

/* Check the "vend" restriction. */

if (nva_start_addr + size > vend)

return vend;

/* Classify what we have found. */

type = classify_va_fit_type(va, nva_start_addr, size);

if (WARN_ON_ONCE(type == NOTHING_FIT))

return vend;

//我们要分配一段size大小的虚拟地址空间。此时已经找到了一个空闲的struct vmap_area,但是这个vmap_area和size之间可能不是完全匹配。因此,这里先判断它的匹配类型,分别是FL_FIT_TYPE(完全匹配)、LE_FIT_TYPE(左边缘匹配)、RE_FIT_TYPE(右边缘匹配)、NE_FIT_TYPE(非边缘匹配)。对于不是完全匹配的vmap_area,我们需要把剩余的虚拟地址空间再插回红黑树free_vmap_area_root中

/* Update the free vmap_area. */

ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);    //这里就是我们在上面的注释中描述的逻辑,将剩余的线性地址空间插入红黑树中

if (ret)

return vend;

return nva_start_addr;

}

        因此,我们看到,在给vmalloc分配虚拟地址空间的时候,需要用到内核中的free_vmap_area_root这棵红黑树。它存放了系统中所有空闲的vmap_area,也就是vmalloc可用的空闲虚拟地址空间。而正在使用的虚拟地址空间,则存放在红黑树vmap_area_root中。下面的示意代码概括了匹配类型的判断。
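
        这是一个用户态的小示意,模拟classify_va_fit_type对“空闲vmap_area与本次申请区间”匹配类型的划分;枚举名与内核一致,函数classify_fit则是为说明而虚构的简化版。

/* 用户态示意:空闲区间[va_start, va_end)与本次申请[nva_start, nva_start+size)的匹配类型。 */
#include <stdio.h>

enum fit_type { NOTHING_FIT, FL_FIT_TYPE, LE_FIT_TYPE, RE_FIT_TYPE, NE_FIT_TYPE };

static enum fit_type classify_fit(unsigned long va_start, unsigned long va_end,
                                  unsigned long nva_start, unsigned long size)
{
    unsigned long nva_end = nva_start + size;

    if (nva_start < va_start || nva_end > va_end)
        return NOTHING_FIT;
    if (nva_start == va_start && nva_end == va_end)
        return FL_FIT_TYPE;   /* 完全吃掉这块空闲区间 */
    if (nva_start == va_start)
        return LE_FIT_TYPE;   /* 贴着左边分,剩余部分在右边,继续留在free树中 */
    if (nva_end == va_end)
        return RE_FIT_TYPE;   /* 贴着右边分,剩余部分在左边 */
    return NE_FIT_TYPE;       /* 从中间切出,左右各剩一段,需要再插入一个新的空闲节点 */
}

int main(void)
{
    /* 空闲区间[0x1000, 0x9000),本次从0x2000分配0x3000字节 -> 输出4:NE_FIT_TYPE */
    printf("%d\n", classify_fit(0x1000, 0x9000, 0x2000, 0x3000));
    return 0;
}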

4.2:__vmalloc_area_node——分配物理页,并建立虚拟地址和物理地址之间的映射

        函数__vmalloc_area_node的实现如下

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,

 pgprot_t prot, int node)

{

const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;

unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;

unsigned int array_size = nr_pages * sizeof(struct page *), i;

struct page **pages;       //使用page数组存放后面分出来的page指针

gfp_mask |= __GFP_NOWARN;

if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))

gfp_mask |= __GFP_HIGHMEM;

/* Please note that the recursion is strictly bounded. */

if (array_size > PAGE_SIZE) {

pages = __vmalloc_node(array_size, 1, nested_gfp, node,

area->caller);

} else {

pages = kmalloc_node(array_size, nested_gfp, node);

}

//上面为page指针数组分配内存。可见,当要分配的数组大小超过一页时,会嵌套调用__vmalloc_node来分配这个数组

area->pages = pages;

area->nr_pages = nr_pages;

for (i = 0; i < area->nr_pages; i++) {

struct page *page;

if (node == NUMA_NO_NODE)

page = alloc_page(gfp_mask);           //从伙伴系统中分配页

else

page = alloc_pages_node(node, gfp_mask, 0);

area->pages[i] = page;

if (gfpflags_allow_blocking(gfp_mask))

cond_resched();

}

atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),       //通过这个函数建立vmalloc虚拟地址和物理页之间的映射。这里的get_vm_area_size很有讲究:它返回的大小不包含保护页,因此保护页不会被映射。当越界访问落到保护页时,就会触发缺页异常而报错。此外需要知道,这里的映射是添加到内核页表swapper_pg_dir中的

prot, pages) < 0)

goto fail;

return area->addr;

}
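
        上面的注释提到,当page指针数组本身超过一页时,会嵌套调用__vmalloc_node来分配这个数组。下面用一个用户态小程序粗略估算这个阈值,这里假设页大小为4KB、指针为8字节(常见的64位配置)。

/* 用户态计算示意:估算page指针数组何时超过一页(基于4KB页、8字节指针的假设)。 */
#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096;            /* 假设的PAGE_SIZE */
    unsigned long ptr_size  = sizeof(void *);  /* 64位下为8 */
    unsigned long nr = page_size / ptr_size;   /* 一页最多能放下的page指针数 */

    /* 当vmalloc的大小超过nr个页时,array_size > PAGE_SIZE,转而用__vmalloc_node分配数组 */
    printf("threshold: %lu pages (~%lu KB)\n", nr, nr * page_size / 1024);
    return 0;
}

        按这个假设,vmalloc超过512个页(约2MB)时,page指针数组就放不进一页,需要嵌套走vmalloc分配。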

        我们看到,上面通过vmalloc更新了页表swapper_pg_dir中的页表项。同时,我们还知道,不管是内核线程,或者是在内核态执行的用户进程,在访问内核地址的时候,都是使用页表swapper_pg_dir。因此,映射之后,所有进程都能够访问这个映射的地址。
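
        结合上面的流程,下面给出一个最简的内核模块示意,演示vmalloc/vfree的典型用法,并用vmalloc_to_page说明虚拟地址连续而物理页不必连续。模块名、分配大小等都是示例中的假设,实际使用时还应做完整的错误处理。

/* 内核模块示意:vmalloc分配一大块内存并写入,卸载时释放。 */
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/mm.h>

#define DEMO_SIZE (4UL << 20)   /* 4MB,示例大小 */

static void *demo_buf;

static int __init vmalloc_demo_init(void)
{
	demo_buf = vmalloc(DEMO_SIZE);
	if (!demo_buf)
		return -ENOMEM;

	memset(demo_buf, 0, DEMO_SIZE);   /* 虚拟地址连续,可以像普通内存一样访问 */
	pr_info("vmalloc_demo: vaddr=%px page0=%px page1=%px\n",
		demo_buf,
		vmalloc_to_page(demo_buf),               /* 第一页对应的struct page */
		vmalloc_to_page(demo_buf + PAGE_SIZE));  /* 第二页,物理上不一定与第一页相邻 */
	return 0;
}

static void __exit vmalloc_demo_exit(void)
{
	vfree(demo_buf);
}

module_init(vmalloc_demo_init);
module_exit(vmalloc_demo_exit);
MODULE_LICENSE("GPL");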

5:out of memory

        下面的out_of_memory函数就是我们熟知的OOM流程的入口。在系统内存不足的时候,内核会从中选出一个进程杀掉,以回收内存。

/**

 * out_of_memory - kill the "best" process when we run out of memory

 * @oc: pointer to struct oom_control

 *

 * If we run out of memory, we have the choice between either

 * killing a random task (bad), letting the system crash (worse)

 * OR try to be smart about which process to kill. Note that we

 * don't have to be perfect here, we just have to be good.

 */

bool out_of_memory(struct oom_control *oc)

{

unsigned long freed = 0;

if (oom_killer_disabled)

return false;

/*

 * If current has a pending SIGKILL or is exiting, then automatically

 * select it.  The goal is to allow it to allocate so that it may

 * quickly exit and free its memory.

 */

if (task_will_free_mem(current)) {          //检查current是否正在退出。如果是,就直接把current选为victim,让它尽快退出并释放内存

mark_oom_victim(current);

wake_oom_reaper(current);      //唤醒内核线程oom_reaper处理

return true;

}

/*

 * The OOM killer does not compensate for IO-less reclaim.

 * pagefault_out_of_memory lost its gfp context so we have to

 * make sure exclude 0 mask - all other users should have at least

 * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to

 * invoke the OOM killer even if it is a GFP_NOFS allocation.

 */

if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))

return true;

/*

 * Check if there were limitations on the allocation (only relevant for

 * NUMA and memcg) that may require different handling.

 */

oc->constraint = constrained_alloc(oc);

if (oc->constraint != CONSTRAINT_MEMORY_POLICY)

oc->nodemask = NULL;

check_panic_on_oom(oc);             //检查发生oom的时候,是否要做系统panic。一般不做

if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&       //如果设置了sysctl_oom_kill_allocating_task,发生oom时会直接杀掉当前正在申请内存的进程。这个开关可以通过/proc/sys/vm/oom_kill_allocating_task设置

    current->mm && !oom_unkillable_task(current) &&

    oom_cpuset_eligible(current, oc) &&

    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {

get_task_struct(current);

oc->chosen = current;

oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");   //这时直接杀死当前触发分配的进程,而不是去挑选一个得分最高的进程

return true;

}

select_bad_process(oc);      //一般流程,我们选择一个进程杀死

/* Found nothing?!?! */

if (!oc->chosen) {

dump_header(oc, NULL);

pr_warn("Out of memory and no killable processes...\n");

/*

 * If we got here due to an actual allocation at the

 * system level, we cannot survive this and will enter

 * an endless loop in the allocator. Bail out now.

 */

if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))

panic("System is deadlocked on memory\n");

}

if (oc->chosen && oc->chosen != (void *)-1UL)

oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :          //杀死我们选择的进程

 "Memory cgroup out of memory");

return !!oc->chosen;

}
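
        如果想在测试环境中观察上面的OOM流程,可以用一个简单的用户态“吃内存”程序不断申请并写入内存,直到触发OOM killer(dmesg中会出现Out of memory日志)。下面是一个示意,请只在测试虚拟机中运行,切勿在生产环境使用。

/* 用户态示意:持续申请并写入内存,用于在测试环境中观察OOM killer。 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    size_t step = 64UL << 20;   /* 每次申请64MB */
    size_t total = 0;

    for (;;) {
        char *p = malloc(step);
        if (!p)
            break;              /* 在关闭overcommit时,malloc可能直接失败而不触发OOM */
        memset(p, 0xa5, step);  /* 真正写入,迫使内核实际分配物理页 */
        total += step;
        printf("allocated %zu MB\n", total >> 20);
    }
    return 0;
}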

5.1:select_bad_process——选择一个进程杀死

        一般情况下,我们会找到系统中一个合适的进程杀掉来解决oom。

/*

 * Simple selection loop. We choose the process with the highest number of

 * 'points'. In case scan was aborted, oc->chosen is set to -1.

 */

static void select_bad_process(struct oom_control *oc)

{

oc->chosen_points = LONG_MIN;

if (is_memcg_oom(oc))

mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);

else {

struct task_struct *p;

rcu_read_lock();

for_each_process(p)        //遍历系统中的所有进程。通过task_struct -> tasks.next完成遍历

if (oom_evaluate_task(p, oc))

break;

rcu_read_unlock();

}

}

static int oom_evaluate_task(struct task_struct *task, void *arg)

{

struct oom_control *oc = arg;

long points;

if (oom_unkillable_task(task))  //1号进程(global init)和内核线程不能被杀

goto next;

……

points = oom_badness(task, oc->totalpages);         //计算当前进程的oom得分

if (points == LONG_MIN || points < oc->chosen_points)     //我们最终杀死得分最高的那个进程

goto next;

select:

if (oc->chosen)

put_task_struct(oc->chosen);

get_task_struct(task);

oc->chosen = task;

oc->chosen_points = points;

next:

return 0;

abort:

if (oc->chosen)

put_task_struct(oc->chosen);

oc->chosen = (void *)-1UL;

return 1;

}

long oom_badness(struct task_struct *p, unsigned long totalpages)

{

long points;

long adj;

if (oom_unkillable_task(p))

return LONG_MIN;

p = find_lock_task_mm(p);

if (!p)

return LONG_MIN;

/*

 * Do not even consider tasks which are explicitly marked oom

 * unkillable or have been already oom reaped or the are in

 * the middle of vfork

 */

adj = (long)p->signal->oom_score_adj;       //可以通过/proc/PID/oom_score_adj查看

if (adj == OOM_SCORE_ADJ_MIN ||           //oom_score_adj设置为-1000的进程不会被oom杀死

test_bit(MMF_OOM_SKIP, &p->mm->flags) ||

in_vfork(p)) {

task_unlock(p);

return LONG_MIN;

}

/*

 * The baseline for the badness score is the proportion of RAM that each

 * task's rss, pagetable and swap space use.

 */

points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +

mm_pgtables_bytes(p->mm) / PAGE_SIZE;       //计算进程使用的页数

task_unlock(p);

/* Normalize to oom_score_adj units */

adj *= totalpages / 1000;

points += adj;           //基准分是进程占用的物理页数,再加上按总页数千分比折算的oom_score_adj,然后返回

return points;

}

        每个进程的oom得分还可以通过/proc/pid/oom_score查看。
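
        下面是一个用户态小示意,读取当前进程的oom_score,并通过/proc/self/oom_score_adj调整自己的oom_score_adj(取值范围是-1000到1000,这里写入500表示让自己在OOM时更容易被选中;调低该值则需要相应权限)。

/* 用户态示意:读取oom_score并调整oom_score_adj。 */
#include <stdio.h>

int main(void)
{
    char buf[64];
    FILE *f;

    f = fopen("/proc/self/oom_score", "r");
    if (f) {
        if (fgets(buf, sizeof(buf), f))
            printf("oom_score: %s", buf);
        fclose(f);
    }

    f = fopen("/proc/self/oom_score_adj", "w");
    if (f) {
        fprintf(f, "500\n");   /* -1000表示永不被OOM杀死,正值表示更容易被选中 */
        fclose(f);
    }
    return 0;
}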
