Linux memory management: the slab allocator

The buddy system allocates memory in units of pages, but in practice many memory requests are measured in bytes. How should such small, byte-sized blocks be allocated?

The slab allocator exists to solve exactly this small-block allocation problem and plays a central role in kernel memory allocation.

The physical memory behind the slab allocator still ultimately comes from the buddy system; the slab allocator simply layers its own algorithms on top of those contiguous physical pages in order to manage small memory objects.

The slab allocator groups objects into caches, each cache acting as a reserve of objects of a single type. The main memory backing a cache is divided into slabs, and each slab consists of one or more contiguous physical pages. These page frames contain both allocated objects and free objects.

This article annotates essentially all of the code involved in the slab allocator and describes the theory behind it in detail. Given how important the slab allocator is in the Linux kernel, its implementation cannot be summarized in a few sentences; readers interested in its internal details will need some time and patience. The material has been organized so that you can read straight through the annotated code below.
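
Before diving into the internals, here is a minimal sketch of how a driver might use the three interfaces discussed in this article (kmem_cache_create, kmem_cache_alloc, kmem_cache_free). The cache name, struct foo layout and module wrapper are hypothetical and only illustrate the API:

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int id;
	char payload[60];
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
	struct foo *p;

	/* create a slab descriptor for fixed-size struct foo objects */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;

	/* allocate one object from the cache and free it again */
	p = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
	if (p)
		kmem_cache_free(foo_cachep, p);

	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cachep);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");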

1. Creating a slab descriptor

struct kmem_cache is the core data structure of the slab allocator; we refer to it as the slab descriptor. It is defined as follows:

include/linux/slab_def.h
/*
 * Definitions unique to the original Linux SLAB allocator.
 */

/*每个slab描述符都由一个struct kmem_cache数据结构来抽象描述*/
struct kmem_cache {

	/*一个Per-CPU的struct array_cache,每个CPU一个,表示本地CPU的对象缓存池*/
	struct array_cache __percpu *cpu_cache;

/* 1) Cache tunables. Protected by slab_mutex */

	/*表示当前CPU的本地对象缓存池array_cache为空时,从共享的缓冲池或者slabs_partial/slabs_free列表中获取对象的数目*/
	unsigned int batchcount;

	/*当本地对象缓存池的空闲对象数目大于limit时,会主动释放batchcount个对象,便于内核回收和销毁slab*/
	unsigned int limit;

	/*用于多核系统*/
	unsigned int shared;

	/*对象的长度,这个长度要加上align对齐字节*/
	unsigned int size;
	struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

	/*对象的分配掩码*/
	unsigned int flags;		/* constant flags */

	/*一个slab中最多可以有多少个对象*/
	unsigned int num;		/* # of objs per slab */

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */

	/*一个slab中占用2^gfporder个页面*/
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t allocflags;

	/*一个slab描述符中可用的cache colour(着色)个数*/
	size_t colour;			/* cache colouring range */

	/*一个cache colour的长度,和L1缓存行相同*/
	unsigned int colour_off;	/* colour offset */

	/*off-slab时使用,将freelist放在slab物理页面外部*/
	struct kmem_cache *freelist_cache;

	/*每个slab中freelist管理区的大小,其中每个对象占用sizeof(freelist_idx_t)即1字节*/
	unsigned int freelist_size;

	/* constructor func */
	void (*ctor)(void *obj);

/* 4) cache creation/removal */
	/*slab描述符的名称*/
	const char *name;
	struct list_head list;
	/*引用次数,释放slab描述符时会判断,只有引用次数为0时才真正释放*/
	int refcount;

	/*对象的实际大小*/
	int object_size;

	/*对齐的长度*/
	int align;

/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	unsigned long node_overflow;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;

	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. size contains the total
	 * object size including these internal fields, the following two
	 * variables contain the offset to the user object and its size.
	 */
	int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
	struct memcg_cache_params memcg_params;
#endif

	/*slab节点,在NUMA系统中每个节点有一个struct kmem_cache_node数据结构,在ARM Vexpress平台中,只有一个节点*/
	struct kmem_cache_node *node[MAX_NUMNODES];
};

The slab descriptor provides each CPU with its own object cache pool (struct array_cache):

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache footprint.
 *
 */

struct array_cache {
	/*对象缓存池中可用的对象数目*/
	unsigned int avail;

	/*limit,batchcount和struct kmem_cache中语义一致*/
	unsigned int limit;
	unsigned int batchcount;

	/*从缓存池移除一个对象时,将touched置1;而收缩缓存时,将touched置0*/
	unsigned int touched;

	/*保存对象的实体*/
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 *
			 * Entries should not be directly dereferenced as
			 * entries belonging to slabs marked pfmemalloc will
			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
			 */
};

struct kmem_cache_node is the slab node; the physical pages allocated from the buddy system are managed through it.

/*The slab lists for all objects*/
struct kmem_cache_node {
	spinlock_t list_lock;

#ifdef CONFIG_SLAB
	/*注意,三个链表上挂载的是slab所使用的物理页面(struct page),而不是页表*/

	/*slab部分链表,即其链表成员中的物理内存部分用于分配slab对象*/
	struct list_head slabs_partial;	/* partial list first, better asm code */
	/*slab满链表,即其链表成员中的物理内存全部用于分配slab对象*/
	struct list_head slabs_full;
	/*slab空闲链表,即其链表成员中的物理内存全部空闲,未用于分配slab对象*/
	struct list_head slabs_free;

	/*三个链表中所有空闲对象数目*/
	unsigned long free_objects;
	/*slab中可以容许的空闲对象数目最大阈值*/
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	/*多核cpu中,共享缓存区slab对象*/
	struct array_cache *shared;	/* shared per node */
	struct alien_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
#endif

#ifdef CONFIG_SLUB
	unsigned long nr_partial;
	struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_t nr_slabs;
	atomic_long_t total_objects;
	struct list_head full;
#endif
#endif

};

Some global variables that will be used in the code later are listed here first:

mm/slab_common.c
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;

include/linux/slab.h contains the allocator configuration parameters shown below; note that the slab, slob, and slub allocators use different values:

#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT) /*1<<6= 64*/

/*
 * Memory returned by kmalloc() may be used for DMA, so we must make
 * sure that all such allocations are cache aligned. Otherwise,
 * unrelated code may cause parts of the buffer to be read into the
 * cache before the transfer is done, causing old data to be seen by
 * the CPU.
 */
#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES


/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
 */
#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
/*有效*/
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN  /*64*/
#define KMALLOC_MIN_SIZE 	ARCH_DMA_MINALIGN
#define KMALLOC_SHIFT_LOW 	ilog2(ARCH_DMA_MINALIGN) /*6*/
#else
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif



#ifdef CONFIG_SLAB
/*
 * The largest kmalloc size supported by the SLAB allocators is
 * 32 megabyte (2^25) or the maximum allocatable page order if that is
 * less than 32 MB.
 *
 * WARNING: Its not easy to increase this value since the allocators have
 * to do various tricks to work around compiler limitations in order to
 * ensure proper constant folding.
 */
/*22*/
#define KMALLOC_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? (MAX_ORDER + PAGE_SHIFT - 1) : 25)
#define KMALLOC_SHIFT_MAX	KMALLOC_SHIFT_HIGH /*22*/
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW	5
#endif
#endif

#ifdef CONFIG_SLUB
/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)  /*13*/
#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT) /*11+12=23*/
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW	3
#endif
#endif

#ifdef CONFIG_SLOB
/*
 * SLOB passes all requests larger than one page to the page allocator.
 * No kmalloc array is necessary since objects of different sizes can
 * be allocated from the same page.
 */
#define KMALLOC_SHIFT_HIGH	PAGE_SHIFT /*12*/
#define KMALLOC_SHIFT_MAX	30
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW	3
#endif
#endif

/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE	(1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocagtor */
#define KMALLOC_MAX_ORDER	(KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*kmalloc可分配的最小对象大小:slab分配器默认为32,slob/slub为8;本配置中由于ARCH_DMA_MINALIGN=64,上面已将其定义为64*/
#define KMALLOC_MIN_SIZE       (1 << KMALLOC_SHIFT_LOW)

/*分配器中对象最小值,slab分配器的值为16,slob,slub分配器值为8*/
#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? (KMALLOC_MIN_SIZE) : 16)
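
As a worked example (assuming the ARM32 configuration implied by the comments above: PAGE_SHIFT = 12, MAX_ORDER = 11, L1_CACHE_SHIFT = 6, CONFIG_SLAB): ARCH_DMA_MINALIGN = 64, so KMALLOC_MIN_SIZE = 64 and KMALLOC_SHIFT_LOW = 6; KMALLOC_SHIFT_HIGH = min(11 + 12 - 1, 25) = 22, so KMALLOC_MAX_SIZE = KMALLOC_MAX_CACHE_SIZE = 4 MB and KMALLOC_MAX_ORDER = 22 - 12 = 10; SLAB_OBJ_MIN_SIZE = 16. The value 10 (0xa) and the 255 (0xff) per-slab object limit reappear in the calculate_slab_order trace output later in this article.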

The implementation of kmem_cache_create:

/*
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align,unsigned long flags, void (*ctor)(void *))
{
	struct kmem_cache *s;
	const char *cache_name;
	int err;

	get_online_cpus();
	get_online_mems();
	memcg_get_cache_ids();

	/*获取slab锁*/
	mutex_lock(&slab_mutex);

	err = kmem_cache_sanity_check(name, size);
	if (err) {
		s = NULL;	/* suppress uninit var warning */
		goto out_unlock;
	}

	/*
	 * Some allocators will constraint the set of valid flags to a subset
	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
	 * case, and we'll just provide them with a sanitized version of the
	 * passed flags.
	 */
	flags &= CACHE_CREATE_MASK;

	/*查找是否有现成的slab描述符可以复用,若没有则新建一个slab描述符*/
	s = __kmem_cache_alias(name, size, align, flags, ctor);
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	printk("[f]%s,%s,%x,%x,%x \r\n",__func__,name,size,align,flags);
	/*创建新的slab描述符*/
	s = do_kmem_cache_create(cache_name, size, size,calculate_alignment(flags, align, size),flags, ctor, NULL, NULL);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);

	memcg_put_cache_ids();
	put_online_mems();
	put_online_cpus();

	if (err) {
		if (flags & SLAB_PANIC)
			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",name, err);
		else {
			printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",name, err);
			dump_stack();
		}
		return NULL;
	}
	printk("[e]%s,%s,%x,%x,%x,%x \r\n",__func__,name,s->object_size,s->size,s->align,s->flags);
	return s;
}

kmem_cache_create first calls __kmem_cache_alias to see whether an existing slab descriptor can be reused; if not, do_kmem_cache_create builds a new one.

Below is the kmem_cache_create trace output (produced by the printk statements added above):

[ 0.000000] [f]kmem_cache_create,idr_layer_cache,42c,0,40000

[ 0.000000] -----[f],calculate_slab_order,430,8,80040000,1,a,ff

[ 0.000000] calculate_slab_order,0,370,3

[ 0.000000] calculate_slab_order,1,2b0,7

[ 0.000000] -----[e],calculate_slab_order,1,2b0

[ 0.000000] [e]kmem_cache_create,idr_layer_cache,42c,430,8,40000

[ 0.000000] [f]kmem_cache_create,ftrace_event_field,20,4,40000

[ 0.000000] -----[f],calculate_slab_order,20,8,40000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,7c

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] [e]kmem_cache_create,ftrace_event_field,20,20,8,40000

[ 0.000000] [f]kmem_cache_create,ftrace_event_file,30,4,40000

[ 0.000000] -----[f],calculate_slab_order,30,8,40000,1,a,ff

[ 0.000000] calculate_slab_order,0,18,53

[ 0.000000] -----[e],calculate_slab_order,0,18

[ 0.000000] [e]kmem_cache_create,ftrace_event_file,30,30,8,40000

[ 0.000000] [f]kmem_cache_create,radix_tree_node,130,0,60000

[ 0.000000] -----[f],calculate_slab_order,130,8,80060000,1,a,ff

[ 0.000000] calculate_slab_order,0,90,d

[ 0.000000] -----[e],calculate_slab_order,0,90

[ 0.000000] [e]kmem_cache_create,radix_tree_node,130,130,8,60000

The output above shows that the final alignment value (align) chosen by the slab allocator is 8, because on ARM32 the architecture-mandated minimum slab alignment is 8 bytes:

#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
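
As a worked example, take the first trace line above: idr_layer_cache is created with size = 0x42c, align = 0 and flags = 0x40000 (SLAB_PANIC, no SLAB_HWCACHE_ALIGN, assuming the usual flag values 0x00040000 and 0x00002000 for this kernel generation). calculate_alignment therefore skips the hardware-cache branch, raises align from 0 to ARCH_SLAB_MINALIGN = 8, and returns ALIGN(8, sizeof(void *)) = 8, which matches the align value 8 printed in the corresponding [e] line.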

If a mergeable slab descriptor already exists in the system, the merge is attempted as follows:

struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align,unsigned long flags, void (*ctor)(void *))
{
	struct kmem_cache *cachep;

	cachep = find_mergeable(size, align, flags, name, ctor);
	if (cachep) {
		cachep->refcount++; /*slab描述符引用次数*/

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		cachep->object_size = max_t(int, cachep->object_size, size);
	}
	return cachep;
}

struct kmem_cache *find_mergeable(size_t size, size_t align,unsigned long flags, const char *name, void (*ctor)(void *))
{
	struct kmem_cache *s;

	/*不支持merge的情况下直接退出*/
	if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
		return NULL;

	if (ctor) /*指定了构造函数ctor的slab描述符不能参与合并*/
		return NULL;

	/*size关于sizeof(void *)对齐*/
	size = ALIGN(size, sizeof(void *));
	/*修正align值*/
	align = calculate_alignment(flags, align, size);
	size = ALIGN(size, align);
	flags = kmem_cache_flags(size, flags, name, NULL);

	/*遍历slab_caches链表上的slab描述符*/
	list_for_each_entry_reverse(s, &slab_caches, list) {
		if (slab_unmergeable(s)) /*跳过不能merge的slab描述符*/
			continue;

		if (size > s->size) /*size不匹配*/
			continue;

		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)  /*s->size关于align对齐*/
			continue;

		if (s->size - size >= sizeof(void *)) /*size之差必须小于sizeof(void *)*/
			continue;

		if (IS_ENABLED(CONFIG_SLAB) && align && (align > s->align || s->align % align))
			continue;

		return s;
	}
	return NULL;
}

#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
/*
 * Figure out what the alignment of the objects will be given a set of
 * flags, a user specified alignment and the size of the objects.
 */
unsigned long calculate_alignment(unsigned long flags,unsigned long align, unsigned long size)
{
	/*
	 * If the user wants hardware cache aligned objects then follow that
	 * suggestion if the object is sufficiently large.
	 *
	 * The hardware cache alignment cannot override the specified
	 * alignment though. If that is greater then use it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign); /*修正align值*/
	}

	if (align < ARCH_SLAB_MINALIGN) /*架构要求的SLAB最小对齐长度为8字节*/
		align = ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}
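
For caches that do pass SLAB_HWCACHE_ALIGN (flags 0x2000 in the boot trace below), the while loop halves ralign until the object no longer fits twice into it. Assuming cache_line_size() = 64: a 128-byte object keeps ralign = 64 (128 <= 32 is false), whereas a hypothetical 24-byte object ends up with ralign = 32 (24 <= 32 halves once, 24 <= 16 stops), so two such objects share a cache line without any object straddling two lines.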

static struct kmem_cache * do_kmem_cache_create(const char *name, size_t object_size, size_t size,
		     size_t align, unsigned long flags, void (*ctor)(void *),
		     struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct kmem_cache *s;
	int err;

	err = -ENOMEM;

	/*从kmem_cache中分配struct kmem_cache数据结构*/
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		goto out;

	s->name = name;
	/*对象实际大小*/
	s->object_size = object_size;
	/*对象的长度,这个长度要加上align对齐字节,后面对齐后会重新设置size的值*/
	s->size = size;
	s->align = align;
	s->ctor = ctor;

	err = init_memcg_params(s, memcg, root_cache);
	if (err)
		goto out_free_cache;

	/*初始化slab描述符内部其他成员*/
	err = __kmem_cache_create(s, flags);
	if (err)
		goto out_free_cache;

	/*slab描述符引用次数设置为1*/
	s->refcount = 1;
	/*将新分配的struct kmem_cache挂载到slab_cachesi链表上*/
	list_add(&s->list, &slab_caches);
out:
	if (err)
		return ERR_PTR(err);
	return s;

out_free_cache:
	destroy_memcg_params(s);
	kmem_cache_free(kmem_cache, s);
	goto out;
}
#define	BYTES_PER_WORD		sizeof(void *)
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
#define FREELIST_BYTE_INDEX	(((PAGE_SIZE >> BITS_PER_BYTE) <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
#define SLAB_OBJ_MAX_NUM	((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /*0xff,255*/

/**
 * __kmem_cache_create - Create a cache.
 * @cachep: cache management descriptor
 * @flags: SLAB flags
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
	size_t left_over, freelist_size;
	/*word对齐,物理地址按word对齐时,DDR访存更加高效*/
	size_t ralign = BYTES_PER_WORD;
	gfp_t gfp;
	int err;
	size_t size = cachep->size;

#if DEBUG
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
						2 * sizeof(unsigned long long)))
		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
	if (!(flags & SLAB_DESTROY_BY_RCU))
		flags |= SLAB_POISON;
#endif
	if (flags & SLAB_DESTROY_BY_RCU)
		BUG_ON(flags & SLAB_POISON);
#endif

	/*
	 * Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	/*size自身也需要关于BYTES_PER_WORD对齐*/
	if (size & (BYTES_PER_WORD - 1)) { /*size没有对齐到BYTES_PER_WORD*/
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1); /*size关于BYTES_PER_WORD向上取整*/
	}

	/*如果flags设置了SLAB_RED_ZONE标志,size自身需要关于REDZONE_ALIGN对齐*/
	if (flags & SLAB_RED_ZONE) {
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		size += REDZONE_ALIGN - 1;
		size &= ~(REDZONE_ALIGN - 1); /*size关于REDZONE_ALIGN向上对齐*/
	}

	/* 3) caller mandated alignment */
	if (ralign < cachep->align) {
		ralign = cachep->align; /*以设置的align为准*/
	}

	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it.
	 */
	/*最终align值*/
	cachep->align = ralign;

	/*slab分配器状态*/
	if (slab_is_available())
		gfp = GFP_KERNEL;
	else
		gfp = GFP_NOWAIT;

#if DEBUG

	/*
	 * Both debugging options require word-alignment which is calculated
	 * into align above.
	 */
	if (flags & SLAB_RED_ZONE) {
		/* add space for red zone words */
		cachep->obj_offset += sizeof(unsigned long long);
		size += 2 * sizeof(unsigned long long);
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires one word storage behind the end of
		 * the real object. But if the second red zone needs to be
		 * aligned to 64 bits, we must allow that much space.
		 */
		if (flags & SLAB_RED_ZONE)
			size += REDZONE_ALIGN;
		else
			size += BYTES_PER_WORD;
	}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
	if (size >= kmalloc_size(INDEX_NODE + 1)
	    && cachep->object_size > cache_line_size()
	    && ALIGN(size, cachep->align) < PAGE_SIZE) {
		cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
		size = PAGE_SIZE;
	}
#endif
#endif

	/*
	 * Determine if the slab management is 'on' or 'off' slab.
	 * (bootstrapping cannot cope with offslab caches so don't do
	 * it too early on. Always use on-slab management when
	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
	 */
	if ((size >= (PAGE_SIZE >> 5)/*128*/) && !slab_early_init && !(flags & SLAB_NOLEAKTRACE))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB; /*off-slab*/

	/*size做对齐处理*/
	size = ALIGN(size, cachep->align);
	/*
	 * We should restrict the number of objects in a slab to implement
	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
	 */
	/*如果size小于SLAB_OBJ_MIN_SIZE,size取SLAB_OBJ_MIN_SIZE,且关于align对齐*/
	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) 
		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); /*最小size值为SLAB_OBJ_MIN_SIZE,该值关于align对齐*/

	/*最终obj对象的size大小和align对齐大小和传入的参数可能相去甚远,因为检查过程中会修改size和align值*/
	/*
		计算一个slab需要多少个物理页面,同时也计算一个slab中可以容纳多少个对象;
		此时还未进行物理页面的真正分配;
		通过增加打印信息,基本上小内存块size分配一个物理页面,大的内存块size也是分配少量物理页面满足需求即可。
	*/
	left_over = calculate_slab_order(cachep, size, cachep->align, flags);

	if (!cachep->num)
		return -E2BIG;

	/*freelist_size整体对齐所需空间*/
	freelist_size = calculate_freelist_size(cachep->num, cachep->align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
		flags &= ~CFLGS_OFF_SLAB; /*如果剩余空间足够大,就取消CFLGS_OFF_SLAB,放在slab内部*/
		left_over -= freelist_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		freelist_size = calculate_freelist_size(cachep->num, 0);

#ifdef CONFIG_PAGE_POISONING
		/* If we're going to use the generic kernel_map_pages()
		 * poisoning, then it's going to smash the contents of
		 * the redzone and userword anyhow, so switch them off.
		 */
		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
	}

	/*colour_off大小为cache line的大小*/
	cachep->colour_off = cache_line_size(); /*cache line大小*/
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < cachep->align)
		cachep->colour_off = cachep->align;

	/*剩余空间中能够用于cachep->colour_off的个数,即着色使用的空间*/
	cachep->colour = left_over / cachep->colour_off;
	cachep->freelist_size = freelist_size;
	cachep->flags = flags;
	/*这里设置compound标志*/
	cachep->allocflags = __GFP_COMP;
	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
		cachep->allocflags |= GFP_DMA;

	/*重新设置size值*/
	cachep->size = size;
	cachep->reciprocal_buffer_size = reciprocal_value(size);

	if (flags & CFLGS_OFF_SLAB) {
		/*如果管理数据off slab,则分配空间*/
		cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
		/*
		 * This is a possibility for one of the kmalloc_{dma,}_caches.
		 * But since we go off slab only for object size greater than
		 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
		 * in ascending order,this should not happen at all.
		 * But leave a BUG_ON for some lucky dude.
		 */
		BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
	}

	/*设置cpu cache*/
	err = setup_cpu_cache(cachep, gfp);
	if (err) {
		__kmem_cache_shutdown(cachep);
		return err;
	}

	return 0;
}
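
A short worked example of the colouring fields set near the end of __kmem_cache_create (the numbers are hypothetical, using the cache_line_size() = 64 assumed throughout this article): if calculate_slab_order leaves left_over = 192 bytes, then colour_off = 64 and colour = 192 / 64 = 3. When cache_grow later allocates slabs for this descriptor, it cycles colour_next through 0, 1, 2, so consecutive slabs shift the start of their object area by 0, 64 and 128 bytes from the start of the page, staggering objects of different slabs across different cache lines.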

/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
static size_t calculate_slab_order(struct kmem_cache *cachep,size_t size, size_t align, unsigned long flags)
{
	unsigned long offslab_limit;
	size_t left_over = 0;
	int gfporder;

	printk("-----[f],%s,%x,%x,%x,%x,%x,%x \r\n",__func__,size,align,flags,slab_max_order,KMALLOC_MAX_ORDER,SLAB_OBJ_MAX_NUM);
	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
		unsigned int num;
		size_t remainder;

		/*预估占用内存页面的空间,slab对象个数,剩余空间*/
		cache_estimate(gfporder, size, align, flags, &remainder, &num);
		printk("%s,%x,%x,%x \r\n",__func__,gfporder,remainder,num);
		if (!num) /*最少也要能分配出一个obj对象*/
			continue;

		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
		if (num > SLAB_OBJ_MAX_NUM) /*对num最大值的限制(255)*/
			break;

		if (flags & CFLGS_OFF_SLAB) { /*管理slab的数据存放在slab自己内存空间之外*/
			size_t freelist_size_per_obj = sizeof(freelist_idx_t);
			/*
			 * Max number of objs-per-slab for caches which
			 * use off-slab slabs. Needed to avoid a possible
			 * looping condition in cache_grow().
			 */
			if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
				freelist_size_per_obj += sizeof(char);
			offslab_limit = size;
			offslab_limit /= freelist_size_per_obj;

 			if (num > offslab_limit) /*避免freelist占用过多空间*/
				break;
		}

		/* Found something acceptable - save it away */
		cachep->num = num;
		cachep->gfporder = gfporder;
		left_over = remainder;

		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		if (flags & SLAB_RECLAIM_ACCOUNT)
			break;

		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		if (gfporder >= slab_max_order) /*slab_max_order为1*/
			break;

		/*
		 * Acceptable internal fragmentation?
		 */
		if (left_over * 8 <= (PAGE_SIZE << gfporder)) /*剩余空间的占比<=总物理页面大小/8*/
			break;
	}
	printk("-----[e],%s,%x,%x\r\n",__func__,gfporder,left_over);
	return left_over;
}

calculate_slab_order computes how many physical pages one slab needs and, at the same time, how many objects that slab can hold.

A slab consists of 2^gfporder contiguous physical pages and contains num objects plus a colouring area and a freelist area.

Below is the trace output from the kernel boot stage:

[ 0.000000] -----[f],calculate_slab_order,80,40,2000,1,a,ff

[ 0.000000] calculate_slab_order,0,40,1f

[ 0.000000] -----[e],calculate_slab_order,0,40

[ 0.000000] -----[f],calculate_slab_order,40,40,2000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,3f

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] -----[f],calculate_slab_order,80,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,20

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] -----[f],calculate_slab_order,c0,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,40,15

[ 0.000000] -----[e],calculate_slab_order,0,40

[ 0.000000] -----[f],calculate_slab_order,100,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,10

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] -----[f],calculate_slab_order,200,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,8

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] -----[f],calculate_slab_order,400,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,4

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] -----[f],calculate_slab_order,800,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,2

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] -----[f],calculate_slab_order,1000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,0,1

[ 0.000000] -----[e],calculate_slab_order,0,0

[ 0.000000] -----[f],calculate_slab_order,2000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,0,1

[ 0.000000] -----[e],calculate_slab_order,1,0

[ 0.000000] -----[f],calculate_slab_order,4000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,0,1

[ 0.000000] -----[e],calculate_slab_order,2,0

[ 0.000000] -----[f],calculate_slab_order,8000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,0,1

[ 0.000000] -----[e],calculate_slab_order,3,0

[ 0.000000] -----[f],calculate_slab_order,10000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,8000,0

[ 0.000000] calculate_slab_order,4,0,1

[ 0.000000] -----[e],calculate_slab_order,4,0

[ 0.000000] -----[f],calculate_slab_order,20000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,8000,0

[ 0.000000] calculate_slab_order,4,10000,0

[ 0.000000] calculate_slab_order,5,0,1

[ 0.000000] -----[e],calculate_slab_order,5,0

[ 0.000000] -----[f],calculate_slab_order,40000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,8000,0

[ 0.000000] calculate_slab_order,4,10000,0

[ 0.000000] calculate_slab_order,5,20000,0

[ 0.000000] calculate_slab_order,6,0,1

[ 0.000000] -----[e],calculate_slab_order,6,0

[ 0.000000] -----[f],calculate_slab_order,80000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,8000,0

[ 0.000000] calculate_slab_order,4,10000,0

[ 0.000000] calculate_slab_order,5,20000,0

[ 0.000000] calculate_slab_order,6,40000,0

[ 0.000000] calculate_slab_order,7,0,1

[ 0.000000] -----[e],calculate_slab_order,7,0

[ 0.000000] -----[f],calculate_slab_order,100000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,8000,0

[ 0.000000] calculate_slab_order,4,10000,0

[ 0.000000] calculate_slab_order,5,20000,0

[ 0.000000] calculate_slab_order,6,40000,0

[ 0.000000] calculate_slab_order,7,80000,0

[ 0.000000] calculate_slab_order,8,0,1

[ 0.000000] -----[e],calculate_slab_order,8,0

[ 0.000000] -----[f],calculate_slab_order,200000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,8000,0

[ 0.000000] calculate_slab_order,4,10000,0

[ 0.000000] calculate_slab_order,5,20000,0

[ 0.000000] calculate_slab_order,6,40000,0

[ 0.000000] calculate_slab_order,7,80000,0

[ 0.000000] calculate_slab_order,8,100000,0

[ 0.000000] calculate_slab_order,9,0,1

[ 0.000000] -----[e],calculate_slab_order,9,0

[ 0.000000] -----[f],calculate_slab_order,400000,40,80002000,1,a,ff

[ 0.000000] calculate_slab_order,0,1000,0

[ 0.000000] calculate_slab_order,1,2000,0

[ 0.000000] calculate_slab_order,2,4000,0

[ 0.000000] calculate_slab_order,3,8000,0

[ 0.000000] calculate_slab_order,4,10000,0

[ 0.000000] calculate_slab_order,5,20000,0

[ 0.000000] calculate_slab_order,6,40000,0

[ 0.000000] calculate_slab_order,7,80000,0

[ 0.000000] calculate_slab_order,8,100000,0

[ 0.000000] calculate_slab_order,9,200000,0

[ 0.000000] calculate_slab_order,a,0,1

[ 0.000000] -----[e],calculate_slab_order,a,0

The output above shows that when the slab object size is small, a single physical page is usually sufficient (order = 0); when the object size is large, num is typically 1 and the order is raised just enough for that single object to fit.
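
Two of the trace lines above, decoded (each iteration prints gfporder, remainder and num in hex): for the 0x80 (128-byte) cache with flags 0x80002000 the CFLGS_OFF_SLAB bit is set, so cache_estimate computes nr_objs = 4096 / 128 = 32 (0x20) with mgmt_size = 0 and left_over = 0; since left_over * 8 <= PAGE_SIZE the loop stops at gfporder = 0. For the 0x2000 (8 KB) cache, an order-0 page cannot hold even one object (num = 0, remainder = 0x1000), so the loop moves on to gfporder = 1, where exactly one object fits and left_over = 0.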

/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,size_t align, int flags, size_t *left_over,unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	/*2^gfporder个页面总共大小*/
	size_t slab_size = PAGE_SIZE << gfporder;

	/*
	 * The slab management structure can be either off the slab or
	 * on it. For the latter case, the memory allocated for a
	 * slab is used for:
	 *
	 * - One unsigned int for each object
	 * - Padding to respect alignment of @align
	 * - @buffer_size bytes for each object
	 *
	 * If the slab management structure is off the slab, then the
	 * alignment will already be calculated into the size. Because
	 * the slabs are all pages aligned, the objects will be at the
	 * correct alignment when allocated.
	 */
	if (flags & CFLGS_OFF_SLAB) {
		mgmt_size = 0;
		/*无其他额外数据,所用页面都用来存放slab对象*/
		nr_objs = slab_size / buffer_size;
	} else {
		/*计算slab对象个数*/
		nr_objs = calculate_nr_objs(slab_size, buffer_size,sizeof(freelist_idx_t), align);
		/*计算freelist_size大小,mgmt_size就是每个obj对象需要占用一个 freelist_idx_t的空间*/
		mgmt_size = calculate_freelist_size(nr_objs, align);
	}
	/*slab对象的个数*/
	*num = nr_objs;
	/*剩余空间大小*/
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}

static int calculate_nr_objs(size_t slab_size, size_t buffer_size,size_t idx_size, size_t align)
{
	int nr_objs;
	size_t remained_size;
	size_t freelist_size;
	int extra_space = 0;

	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
		extra_space = sizeof(char);
	/*
	 * Ignore padding for the initial guess. The padding
	 * is at most @align-1 bytes, and @buffer_size is at
	 * least @align. In the worst case, this result will
	 * be one greater than the number of objects that fit
	 * into the memory allocation when taking the padding
	 * into account.
	 */
	/*多占用idx_size,extra_space的空间*/
	nr_objs = slab_size / (buffer_size + idx_size + extra_space);

	/*
	 * This calculated number will be either the right
	 * amount, or one greater than what we want.
	 */
	remained_size = slab_size - nr_objs * buffer_size;
	/*freelist_size整体对齐*/
	freelist_size = calculate_freelist_size(nr_objs, align);
	if (remained_size < freelist_size)
		nr_objs--;

	return nr_objs;
}

static size_t calculate_freelist_size(int nr_objs, size_t align)
{
	size_t freelist_size;

	freelist_size = nr_objs * sizeof(freelist_idx_t);
	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
		freelist_size += nr_objs * sizeof(char);

	/*freelist_size整体对齐*/
	if (align)
		freelist_size = ALIGN(freelist_size, align);

	return freelist_size;
}

setup_cpu_cache is then called to continue configuring the slab descriptor. Assuming slab_state is FULL, i.e. the slab machinery has finished initializing, it internally calls enable_cpucache for further processing.

setup_cpu_cache->enable_cpucache

/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
	int err;
	int limit = 0;
	int shared = 0;
	int batchcount = 0;

	if (!is_root_cache(cachep)) {
		struct kmem_cache *root = memcg_root_cache(cachep);
		limit = root->limit;
		shared = root->shared;
		batchcount = root->batchcount;
	}

	if (limit && shared && batchcount)
		goto skip_setup;
	/*
	 * The head array serves three purposes:
	 * - create a LIFO ordering, i.e. return objects that are cache-warm
	 * - reduce the number of spinlock operations.
	 * - reduce the number of linked list operations on the slab and
	 *   bufctl chains: array operations are cheaper.
	 * The numbers are guessed, we should auto-tune as described by
	 * Bonwick.
	 */
	/*根据slab对象大小设置limit值*/
	if (cachep->size > 131072) /*>128K*/
		limit = 1;
	else if (cachep->size > PAGE_SIZE) /*>4k*/
		limit = 8;
	else if (cachep->size > 1024) /*>1k*/
		limit = 24;
	else if (cachep->size > 256) /*>256*/
		limit = 54;
	else /*<256*/
		limit = 120;

	/*
	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
	 * allocation behaviour: Most allocs on one cpu, most free operations
	 * on another cpu. For these cases, an efficient object passing between
	 * cpus is necessary. This is provided by a shared array. The array
	 * replaces Bonwick's magazine layer.
	 * On uniprocessor, it's functionally equivalent (but less efficient)
	 * to a larger limit. Thus disabled by default.
	 */
	shared = 0;
	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
		shared = 8; /*slab obj对象size不超个PAGE_SIZE且为多核情况*/

#if DEBUG
	/*
	 * With debugging enabled, large batchcount lead to excessively long
	 * periods with disabled local interrupts. Limit the batchcount
	 */
	if (limit > 32)
		limit = 32;
#endif
	/*batchcount为limit的一半*/
	batchcount = (limit + 1) / 2;
skip_setup:
	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
	if (err)
		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
		       cachep->name, -err);
	return err;
}
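
Applying these heuristics to the caches from the earlier trace (with slab debugging disabled): idr_layer_cache ends up with size = 0x430 = 1072 > 1024, so limit = 24 and batchcount = (24 + 1) / 2 = 12; a 128-byte kmalloc cache gets limit = 120 and batchcount = 60, plus shared = 8 on an SMP system since 128 <= PAGE_SIZE.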

enable_cpucache then calls do_tune_cpucache to finish configuring the slab descriptor.

do_tune_cpucache->__do_tune_cpucache

/* Always called with the slab_mutex held */
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,int batchcount, int shared, gfp_t gfp)
{
	struct array_cache __percpu *cpu_cache, *prev;
	int cpu;

	/*
		分配一个per-cpu变量,本地cpu缓存区;
		分配空间大小,size = sizeof(void *)*limit+sizeof(struct array_cache);
		注意,struct array_cache中的entry[]的大小和limit直接相关;
	*/
	cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
	if (!cpu_cache)
		return -ENOMEM;

	prev = cachep->cpu_cache; /*保存cpu_cache旧值*/
	cachep->cpu_cache = cpu_cache;
	kick_all_cpus_sync();

	check_irq_on();
	cachep->batchcount = batchcount;
	cachep->limit = limit;
	/*多核且slab obj对象size不超过page size情况下shared值为8*/
	cachep->shared = shared;

	if (!prev) /*每个slab描述符对应一个slab节点(kmem_cache_node数据结构)*/
		goto alloc_node;

	/*prev存在,说明该slab描述符之前已有本地对象缓存池:先把旧缓存池中的对象释放回slab节点并销毁可释放的slab,最后释放旧的per-cpu缓存*/
	for_each_online_cpu(cpu) {
		LIST_HEAD(list);
		int node;
		struct kmem_cache_node *n;
		/*获取遍历cpu对应的array_cache*/
		struct array_cache *ac = per_cpu_ptr(prev, cpu);

		node = cpu_to_mem(cpu);
		n = get_node(cachep, node);
		spin_lock_irq(&n->list_lock);
		free_block(cachep, ac->entry, ac->avail, node, &list);
		spin_unlock_irq(&n->list_lock);
		slabs_destroy(cachep, &list);
	}
	free_percpu(prev);

alloc_node:
	/*分配并初始化kmem_cache_node*/
	return alloc_kmem_cache_node(cachep, gfp);
}

include/linux/nodemask.h
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
	[N_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};

/*遍历online状态位图*/
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

#define for_each_node_state(__node, __state) for_each_node_mask((__node), node_states[__state])

From the definitions above, bit 0 of node_states[N_ONLINE] is initialized to 1, so when the node_states[N_ONLINE] bitmap is walked, node 0 is the one that is valid.

/*
 * This initializes kmem_cache_node or resizes various caches for all nodes.
 */
static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
{
	int node;
	struct kmem_cache_node *n;
	struct array_cache *new_shared;
	struct alien_cache **new_alien = NULL;

	printk("----[f]%s,%x \r\n",__func__,use_alien_caches);
	/*根据 node_states 位图配置,online状态中node=0是成立的*/
	for_each_online_node(node) { /*非NUMA结构下应该有一个node存在*/
		if (use_alien_caches) {
			new_alien = alloc_alien_cache(node, cachep->limit, gfp);
			if (!new_alien)
				goto fail;
		}

		new_shared = NULL;
		/*多核且slab obj对象size不超过page size情况下shared值为8*/
		if (cachep->shared) {
			/*分配并初始化array_cache 用于多核之间共享使用,这个batchcount值:0xbaadf00d*/
			new_shared = alloc_arraycache(node,cachep->shared*cachep->batchcount,0xbaadf00d, gfp);
			if (!new_shared) {
				free_alien_cache(new_alien);
				goto fail;
			}
		}

		n = get_node(cachep, node);
		if (n) {
			struct array_cache *shared = n->shared;
			LIST_HEAD(list);

			spin_lock_irq(&n->list_lock);

			if (shared)
				free_block(cachep, shared->entry,shared->avail, node, &list);

			n->shared = new_shared;
			if (!n->alien) {
				n->alien = new_alien;
				new_alien = NULL;
			}
			n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
			spin_unlock_irq(&n->list_lock);
			slabs_destroy(cachep, &list);
			kfree(shared);
			free_alien_cache(new_alien);
			continue;
		}
		/*分配kmem_cache_node*/
		n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
		if (!n) {
			free_alien_cache(new_alien);
			kfree(new_shared);
			goto fail;
		}

		/*初始化kmem_cache_node*/
		kmem_cache_node_init(n);
		/*周期后台回收 4HZ*/
		n->next_reap = jiffies + REAPTIMEOUT_NODE + ((unsigned long)cachep) % REAPTIMEOUT_NODE;
		/*设置slab节点上的共享缓存区;其中的obj对象要等到kmem_cache_free回收时才会被填充进来*/
		n->shared = new_shared;
		n->alien = new_alien;
		n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
		cachep->node[node] = n; /*注意存放位置*/
	}
	return 0;

fail:
	if (!cachep->list.next) {
		/* Cache is not active yet. Roll back what we did */
		node--;
		while (node >= 0) {
			n = get_node(cachep, node);
			if (n) {
				kfree(n->shared);
				free_alien_cache(n->alien);
				kfree(n);
				cachep->node[node] = NULL;
			}
			node--;
		}
	}
	return -ENOMEM;
}

static void kmem_cache_node_init(struct kmem_cache_node *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);

	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

static struct array_cache __percpu *alloc_kmem_cache_cpus(struct kmem_cache *cachep, int entries, int batchcount)
{
	int cpu;
	size_t size;
	struct array_cache __percpu *cpu_cache;

	size = sizeof(void *) * entries + sizeof(struct array_cache);
	/*分配一个per-cpu变量*/
	cpu_cache = __alloc_percpu(size, sizeof(void *));

	if (!cpu_cache)
		return NULL;

	for_each_possible_cpu(cpu) {
		init_arraycache(per_cpu_ptr(cpu_cache, cpu),entries, batchcount);
	}

	return cpu_cache;
}

static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
	/*
	 * The array_cache structures contain pointers to free object.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(ac);
	if (ac) {
		ac->avail = 0; /*新创建的slab描述符中的array_cache中的avail值为0*/
		ac->limit = limit;
		ac->batchcount = batch;
		ac->touched = 0;
	}
}
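
The space needed for each per-CPU object cache pool follows directly from alloc_kmem_cache_cpus above: size = sizeof(void *) * limit + sizeof(struct array_cache). Assuming a 32-bit system where struct array_cache is four unsigned ints (16 bytes) followed by the flexible entry[] array, a cache with limit = 120 allocates 4 * 120 + 16 = 496 bytes per CPU, and one with limit = 24 allocates 4 * 24 + 16 = 112 bytes per CPU.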

2. Allocating a slab object

kmem_cache_alloc is the core function for allocating a slab object; local interrupts are disabled for the entire slab allocation path.


/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *ret = slab_alloc(cachep, flags, _RET_IP_);

	trace_kmem_cache_alloc(_RET_IP_, ret,cachep->object_size, cachep->size, flags);

	return ret;
}

static __always_inline void * slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
	unsigned long save_flags;
	void *objp;

	flags &= gfp_allowed_mask;

	lockdep_trace_alloc(flags);

	if (slab_should_failslab(cachep, flags))
		return NULL;

	cachep = memcg_kmem_get_cache(cachep, flags);

	cache_alloc_debugcheck_before(cachep, flags);
	local_irq_save(save_flags); /*关闭本地cpu中断*/
	/*分配slab对象*/
	objp = __do_cache_alloc(cachep, flags);
	local_irq_restore(save_flags); /*开启本地cpu中断*/
	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
	kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,flags);
	prefetchw(objp);

	if (likely(objp)) {
		kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
		if (unlikely(flags & __GFP_ZERO))
			memset(objp, 0, cachep->object_size);
	}

	memcg_kmem_put_cache(cachep);
	return objp;
}

static __always_inline void * __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	return ____cache_alloc(cachep, flags);
}

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp;
	struct array_cache *ac;
	bool force_refill = false;

	check_irq_off();

	/*
		获取本地cpu的per-cpu变量array_cache;
		疑问,slab描述符是在cpu1上创建的,且在cpu1上已经完成slab对象的分配;现在在cpu2上执行alloc分配操作结果如何?
		此时cpu2本地缓存中肯定是没有slab对象的,如果是多核情况则查看共享缓存区中是否有slab对象,共享缓存区有则从共享缓存区分配batchcount个对象到
		cpu2本地缓存区中,如果共享缓存区无则查看slab节点(kmem_cache_node)的slab部分/空闲链表中是否存在有效物理页面,如果存在则从物理页面从分配
		slab对象,如果无则只能借助伙伴系统来分配物理页面再从物理页面中分配batchcount个对象到cpu2本地缓存区中用于分配使用。
	*/
	ac = cpu_cache_get(cachep);
	if (likely(ac->avail)) { /*缓存池中存在可分配对象,但是新创建的slab描述符中的avail应该是0*/
		ac->touched = 1;
		objp = ac_get_obj(cachep, ac, flags, false);

		/*
		 * Allow for the possibility all avail objects are not allowed
		 * by the current flags
		 */
		if (objp) {
			STATS_INC_ALLOCHIT(cachep);
			goto out;
		}
		/*获取失败,需要填充本地缓冲区,设置force_refill为true*/
		force_refill = true;
	}

	STATS_INC_ALLOCMISS(cachep);
	/*缓存池中无可用对象用于分配*/
	objp = cache_alloc_refill(cachep, flags, force_refill);
	/*
	 * the 'ac' may be updated by cache_alloc_refill(),
	 * and kmemleak_erase() requires its correct value.
	 */
	ac = cpu_cache_get(cachep);

out:
	/*
	 * To avoid a false negative, if an object that is in one of the
	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
	 * treat the array pointers as a reference to the object.
	 */
	if (objp)
		kmemleak_erase(&ac->entry[ac->avail]);
	return objp;
}

static inline void *ac_get_obj(struct kmem_cache *cachep,struct array_cache *ac, gfp_t flags, bool force_refill)
{
	void *objp;

	if (unlikely(sk_memalloc_socks()))
		objp = __ac_get_obj(cachep, ac, flags, force_refill);
	else
		objp = ac->entry[--ac->avail]; /*直接从entry[]中获取*/

	return objp;
}

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,bool force_refill)
{
	int batchcount;
	struct kmem_cache_node *n;
	struct array_cache *ac;
	int node;

	check_irq_off();
	node = numa_mem_id();
	if (unlikely(force_refill))
		goto force_grow;
retry:

	ac = cpu_cache_get(cachep);
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill.  Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	/*获取kmem_cache_node节点*/
	n = get_node(cachep, node);

	BUG_ON(ac->avail > 0 || !n);
	spin_lock(&n->list_lock);

	/* See if we can refill from the shared array */
	/*
	支持shared则从shared中拷贝batchcount个 slab对象到ac中;
	kmem_cache_node中shared在什么地方进行设置? 在kmem_cache_free中设置,free的slab对象回收到cpu本地缓存区,
                如果本地缓存区数量超过limit值则移动batchcount个obj对象到共享缓存区中。
	*/
	if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
		n->shared->touched = 1;
		goto alloc_done;
	}

	/*共享缓存区无obj对象则从slab节点的slab部分/空闲链表中通过物理页面来分配*/	
	while (batchcount > 0) {
		struct list_head *entry;
		struct page *page;
		/* Get slab alloc is to come from. */
		entry = n->slabs_partial.next;
		if (entry == &n->slabs_partial) {
			n->free_touched = 1; /*node的slab部分链表为空*/
			entry = n->slabs_free.next;
			if (entry == &n->slabs_free) /*node的slab空闲链表也为空,则必须从伙伴系统中进行物理页面的分配*/
				goto must_grow;
		}

		/*从slab链表中获取物理页面进行obj对象的分配*/
		page = list_entry(entry, struct page, lru);
		check_spinlock_acquired(cachep);

		/*
		 * The slab was either on partial or free list so
		 * there must be at least one object available for
		 * allocation.
		 */
		BUG_ON(page->active >= cachep->num); /*上述两个链表中不应该出现page中无空间分配slab obj对象的情况*/

		/*每次从kmem_cache_node中最大分配batchcount个slab obj对象到本地cpu slab缓存区entry[]中*/
		while (page->active < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			/*将slab obj对象放到本地cpu缓存区entry[]中*/
			ac_put_obj(cachep, ac, slab_get_obj(cachep, page,node)/*从slab物理页面中分配obj对象*/);
		}

		/* move slabp to correct slabp list: */
		list_del(&page->lru);
		if (page->active == cachep->num) /*物理页面空间已经全部用于slab obj对象分配,则将其挂接到slabs_full链表上*/
			list_add(&page->lru, &n->slabs_full);
		else
			list_add(&page->lru, &n->slabs_partial); /*物理页面空间部分用于slab obj对象分配,则将其挂接到slabs_partial链表上*/
	}

must_grow:
	n->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&n->list_lock);

	if (unlikely(!ac->avail)) { /*当前cpu中的本地缓存区array_cache中无可用slab对象*/
		int x;
force_grow:
		/*伙伴系统分配物理页面用于slab分配器*/
		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

		/* cache_grow can reenable interrupts, then ac could change. */
		/*获取本地cpu的array_cache*/
		ac = cpu_cache_get(cachep);
		node = numa_mem_id();

		/* no objects in sight? abort */
		if (!x && (ac->avail == 0 || force_refill))
			return NULL;

		/*
		前面cache_grow已经分配了物理页面但是并未添加到本地缓存中,所以本地缓存中还是无可用的obj对象用于分配,需要跳转到retry,
		 分配batchcount个obj对象到cpu本地缓存区中。
		*/
		if (!ac->avail)		/* objects refilled by interrupt? */
			goto retry;
	}
	ac->touched = 1;

	/*从本地缓存区中获取obj对象*/
	return ac_get_obj(cachep, ac, flags, force_refill);
}

static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
{
	return ((freelist_idx_t *)page->freelist)[idx];
}

static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
				 unsigned int idx)
{
	return page->s_mem + cache->size * idx;
}

/*obj对象在整个slab对象中的index索引号*/
static inline unsigned int obj_to_index(const struct kmem_cache *cache,const struct page *page, void *obj)
{
	u32 offset = (obj - page->s_mem); /*偏移量*/
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
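
The two helpers above are inverses of each other: index_to_obj computes objp = page->s_mem + cache->size * idx, and obj_to_index recovers idx = (objp - page->s_mem) / cache->size, using reciprocal_divide with the reciprocal_buffer_size precomputed in __kmem_cache_create instead of a real division. For example, with size = 0x130 (304 bytes) the object at index 5 starts at s_mem + 1520, and an object found at offset 1520 maps back to index 5.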


/*从slab物理页面中分配obj对象*/
static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,int nodeid)
{
	void *objp;

	/*从分配的物理页面中分配一个obj对象*/
	objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); /*page->s_mem + cache->size * idx*/
	page->active++; /*active表示该slab物理页面中已经分配(活跃)的obj对象个数*/
#if DEBUG
	WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
#endif

	return objp;
}

/*将obj对象释放回slab物理页面*/
static void slab_put_obj(struct kmem_cache *cachep, struct page *page,void *objp, int nodeid)
{
	/*obj对象在整个slab中的index索引号*/
	unsigned int objnr = obj_to_index(cachep, page, objp);
#if DEBUG
	unsigned int i;

	/* Verify that the slab belongs to the intended node */
	WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);

	/* Verify double free bug */
	for (i = page->active; i < cachep->num; i++) {
		if (get_free_obj(page, i) == objnr) {
			printk(KERN_ERR "slab: double free detected in cache "
					"'%s', objp %p\n", cachep->name, objp);
			BUG();
		}
	}
#endif
	page->active--;
	set_free_obj(page, page->active, objnr);
}


/*将slab obj对象放到本地cpu缓存区中*/
static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,void *objp)
{
	if (unlikely(sk_memalloc_socks()))
		objp = __ac_put_obj(cachep, ac, objp);

	/*将slab obj对象放到本地cpu缓存区中*/
	ac->entry[ac->avail++] = objp;
}


/*从本地cpu缓存区中获取slab obj对象*/
static inline void *ac_get_obj(struct kmem_cache *cachep,struct array_cache *ac, gfp_t flags, bool force_refill)
{
	void *objp;

	if (unlikely(sk_memalloc_socks()))
		objp = __ac_get_obj(cachep, ac, flags, force_refill);
	else
		objp = ac->entry[--ac->avail]; /*直接从entry[]中获取*/

	return objp;
}

/*
 * Transfer objects in one arraycache to another.
 * Locking must be handled by the caller.
 *
 * Return the number of entries transferred.
 */
static int transfer_objects(struct array_cache *to,struct array_cache *from, unsigned int max)
{
	/* Figure out how many entries to transfer */
	int nr = min3(from->avail, max, to->limit - to->avail);

	if (!nr)
		return 0;

	/*从from中拷贝对象到to中*/
	memcpy(to->entry + to->avail, from->entry + from->avail -nr,sizeof(void *) *nr);

	from->avail -= nr;
	to->avail += nr;
	return nr;
}
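
A quick worked example of transfer_objects (the numbers are hypothetical): if the shared cache has avail = 200, the caller passes max = batchcount = 60, and the local cache has limit = 120 with avail = 0, then nr = min3(200, 60, 120) = 60, and the 60 most recently freed pointers (the top of the shared entry[] stack) are copied into the local cache.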

/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow(struct kmem_cache *cachep,gfp_t flags, int nodeid, struct page *page)
{
	void *freelist;
	size_t offset;
	gfp_t local_flags;
	struct kmem_cache_node *n;

	/*
	 * Be lazy and only check for valid flags here,  keeping it out of the
	 * critical path in kmem_cache_alloc().
	 */
	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
		pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
		BUG();
	}
	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

	/* Take the node list lock to change the colour_next on this node */
	check_irq_off();
	n = get_node(cachep, nodeid);
	spin_lock(&n->list_lock);

	/* Get colour for the slab, and cal the next value. */
	/*slab分配器中着色的处理*/
	offset = n->colour_next;
	n->colour_next++;
	if (n->colour_next >= cachep->colour)
		n->colour_next = 0;
	spin_unlock(&n->list_lock);

	/*着色本质是将空闲空间前移,错开cache line颠簸*/
	offset *= cachep->colour_off; /*colour_off大小为cache line的大小*/

	if (local_flags & __GFP_WAIT)
		local_irq_enable();

	/*
	 * The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	kmem_flagcheck(cachep, flags);

	/*
	 * Get mem for the objs.  Attempt to allocate a physical page from
	 * 'nodeid'.
	 */
	if (!page) /*从伙伴系统中分配物理页面*/
		page = kmem_getpages(cachep, local_flags, nodeid);
	if (!page)
		goto failed;

	/* Get slab management. */
	/*整个slab物理页面的布局管理*/
	freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
	if (!freelist)
		goto opps1;

	slab_map_pages(cachep, page, freelist);
	/*初始化每个obj对象*/
	cache_init_objs(cachep, page);

	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	check_irq_off();
	spin_lock(&n->list_lock);

	/* Make slab active. */
	/*将page添加到slabs_free链表中*/
	list_add_tail(&page->lru, &(n->slabs_free));
	STATS_INC_GROWN(cachep);
	/*node中空闲free_objects数量增加*/
	n->free_objects += cachep->num;
	spin_unlock(&n->list_lock);
	return 1;
opps1:
	kmem_freepages(cachep, page);
failed:
	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	return 0;
}

kmem_getpages allocates from the buddy system the 2^gfporder physical pages that the slab descriptor requires.
/*
 * Interface to system's page allocator. No need to hold the
 * kmem_cache_node ->list_lock.
 *
 * If we requested dmaable memory, we will get it. Even if we
 * did not request dmaable memory, we might get it, but that
 * would be relatively rare and ignorable.
 */
static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,int nodeid)
{
	struct page *page;
	int nr_pages;

	/*
	注意,slab描述符中 cachep->allocflags默认设置为__GFP_COMP;
	该标志使得在prep_compound_page中,slab后续的每个物理页面都能通过p->first_page找到头物理页面
	*/
	flags |= cachep->allocflags;
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		flags |= __GFP_RECLAIMABLE;

	if (memcg_charge_slab(cachep, flags, cachep->gfporder))
		return NULL;

	/*调用__alloc_pages 分配物理页面,分配完物理页面最后检查时的prep_compound_page中会处理__GFP_COMP标志*/
	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
	if (!page) {
		memcg_uncharge_slab(cachep, cachep->gfporder);
		slab_out_of_memory(cachep, flags, nodeid);
		return NULL;
	}

	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
	if (unlikely(page->pfmemalloc))
		pfmemalloc_active = true;

	nr_pages = (1 << cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		add_zone_page_state(page_zone(page),
			NR_SLAB_RECLAIMABLE, nr_pages);
	else
		add_zone_page_state(page_zone(page),
			NR_SLAB_UNRECLAIMABLE, nr_pages);
	__SetPageSlab(page);
	if (page->pfmemalloc)
		SetPageSlabPfmemalloc(page);

	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);

		if (cachep->ctor)
			kmemcheck_mark_uninitialized_pages(page, nr_pages);
		else
			kmemcheck_mark_unallocated_pages(page, nr_pages);
	}

	return page;
}


/*
 * Get the memory for a slab management obj.
 *
 * For a slab cache when the slab descriptor is off-slab, the
 * slab descriptor can't come from the same cache which is being created,
 * Because if it is the case, that means we defer the creation of
 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
 * And we eventually call down to __kmem_cache_create(), which
 * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
 * This is a "chicken-and-egg" problem.
 *
 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
 * which are all initialized during kmem_cache_init().
 */
static void *alloc_slabmgmt(struct kmem_cache *cachep,
				   struct page *page, int colour_off,
				   gfp_t local_flags, int nodeid)
{
	void *freelist;

	/*page物理页面对应的虚拟地址*/
	void *addr = page_address(page);

	if (OFF_SLAB(cachep)) {
		/* Slab management obj is off-slab. */
		freelist = kmem_cache_alloc_node(cachep->freelist_cache,local_flags, nodeid);
		if (!freelist)
			return NULL;
	} else {
		freelist = addr + colour_off; /*着色偏移后面存放freelist管理数据*/
		colour_off += cachep->freelist_size;
	}
	/*active设置为0*/
	page->active = 0;
	/*在addr基础上做偏移(着色)后才使用该地址,并不是每个slab对象需要偏移,而是每次进行slab整体空间分配时才需要*/
	page->s_mem = addr + colour_off;
	return freelist;
}

 include/linux/mm.h
#define page_address(page) lowmem_page_address(page)

static __always_inline void *lowmem_page_address(const struct page *page)
{
	return __va(PFN_PHYS(page_to_pfn(page)));
}

#define page_to_pfn __page_to_pfn
#define pfn_to_page __pfn_to_page

#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))

#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)

/*页框号对应的物理地址*/
#define PFN_PHYS(x)	((phys_addr_t)(x) << PAGE_SHIFT)
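
Putting these macros together (assuming a Vexpress-like layout with RAM starting at physical 0x60000000, i.e. ARCH_PFN_OFFSET = 0x60000, and PAGE_OFFSET = 0xC0000000): the struct page at index 10 of mem_map corresponds to pfn 0x6000a, PFN_PHYS gives physical address 0x6000a000, and __va maps that to the linear virtual address 0xc000a000, which is what page_address (and hence alloc_slabmgmt's addr) returns.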

/*
 * Map pages beginning at addr to the given cache and slab. This is required
 * for the slab allocator to be able to lookup the cache and slab of a
 * virtual address for kfree, ksize, and slab debugging.
 */
static void slab_map_pages(struct kmem_cache *cache, struct page *page,void *freelist)
{
	/*page内部也存在数据结构 指向slab描述符*/
	page->slab_cache = cache;
	page->freelist = freelist;
}

/*page为第一个物理页面*/
static void cache_init_objs(struct kmem_cache *cachep,struct page *page)
{
	int i;

	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, page, i); /*page->s_mem + cache->size*idx*/
#if DEBUG
		/* need to poison the objs? */
		if (cachep->flags & SLAB_POISON)
			poison_obj(cachep, objp, POISON_FREE);
		if (cachep->flags & SLAB_STORE_USER)
			*dbg_userword(cachep, objp) = NULL;

		if (cachep->flags & SLAB_RED_ZONE) {
			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
		}
		/*
		 * Constructors are not allowed to allocate memory from the same
		 * cache which they are a constructor for.  Otherwise, deadlock.
		 * They must also be threaded.
		 */
		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
			cachep->ctor(objp + obj_offset(cachep));

		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
					   " end of an object");
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the"
					   " start of an object");
		}
		if ((cachep->size % PAGE_SIZE) == 0 &&
			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
			kernel_map_pages(virt_to_page(objp),
					 cachep->size / PAGE_SIZE, 0);
#else
		if (cachep->ctor)
			cachep->ctor(objp);
#endif
		set_obj_status(page, i, OBJECT_FREE);
		set_free_obj(page, i, i);
	}
}

static inline void set_free_obj(struct page *page, unsigned int idx, freelist_idx_t val)
{
	((freelist_idx_t *)(page->freelist))[idx] = val;
}
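
The freelist is just an array of object indices, and page->active is the boundary between in-use and free entries. The following userspace sketch is hypothetical, loosely modeled on the behaviour of slab_get_obj()/slab_put_obj() in mm/slab.c, and shows why cache_init_objs() can simply set freelist[i] = i.

#include <stdio.h>

#define NUM 8                         /* objects per slab (illustrative)    */
typedef unsigned char freelist_idx_t;

static freelist_idx_t freelist[NUM];  /* stands in for page->freelist       */
static unsigned int active;           /* stands in for page->active         */

/* cache_init_objs(): every slot starts free, freelist[i] = i */
static void init_slab(void)
{
	for (unsigned int i = 0; i < NUM; i++)
		freelist[i] = (freelist_idx_t)i;
	active = 0;
}

/* allocation: hand out the object index stored at freelist[active++] */
static int get_obj(void)
{
	if (active == NUM)
		return -1;                    /* slab is full */
	return freelist[active++];
}

/* free: record the freed index at freelist[--active] */
static void put_obj(int idx)
{
	freelist[--active] = (freelist_idx_t)idx;
}

int main(void)
{
	init_slab();
	int a = get_obj();                 /* returns index 0                     */
	int b = get_obj();                 /* returns index 1                     */
	int c = get_obj();                 /* returns index 2                     */
	put_obj(b);                        /* index 1 becomes the next free slot  */
	int d = get_obj();                 /* LIFO: returns index 1 again         */
	printf("a=%d b=%d c=%d d=%d active=%u\n", a, b, c, d, active);
	return 0;
}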


3、Freeing slab objects
Slab cache objects are released with the kmem_cache_free() interface.

/**
 * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
 * @objp: The previously allocated object.
 *
 * Free an object which was previously allocated from this
 * cache.
 */
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
	unsigned long flags;
	/* look up the kmem_cache (slab descriptor) that objp was allocated from */
	cachep = cache_from_obj(cachep, objp);
	if (!cachep)
		return;

	local_irq_save(flags); /* disable interrupts on the local CPU */
	debug_check_no_locks_freed(objp, cachep->object_size);
	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(objp, cachep->object_size);
	/* release the object back to its cache */
	__cache_free(cachep, objp, _RET_IP_);
	local_irq_restore(flags); /* re-enable interrupts on the local CPU */

	trace_kmem_cache_free(_RET_IP_, objp);
}
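
From a user's point of view the interface pair is kmem_cache_alloc()/kmem_cache_free(). The fragment below is a hedged sketch of the typical call pattern in a module; struct my_obj and the cache name are made up, and error handling is kept minimal.

#include <linux/module.h>
#include <linux/slab.h>

/* a hypothetical object type, used only to show the call pattern */
struct my_obj {
	int id;
	char payload[120];
};

static struct kmem_cache *my_cachep;

static int __init my_init(void)
{
	struct my_obj *p;

	/* dedicated slab cache for struct my_obj, hardware-cacheline aligned */
	my_cachep = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!my_cachep)
		return -ENOMEM;

	p = kmem_cache_alloc(my_cachep, GFP_KERNEL);
	if (p) {
		p->id = 1;
		kmem_cache_free(my_cachep, p);  /* object goes back into the per-CPU entry[] */
	}
	return 0;
}

static void __exit my_exit(void)
{
	kmem_cache_destroy(my_cachep);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");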


/*
 * Release an obj back to its cache. If the obj has a constructed state, it must
 * be in this state _before_ it is released.  Called with disabled ints.
 */
static inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller)
{
	/* get the local CPU's object cache (array_cache) */
	struct array_cache *ac = cpu_cache_get(cachep);

	check_irq_off();
	kmemleak_free_recursive(objp, cachep->flags);
	objp = cache_free_debugcheck(cachep, objp, caller);

	kmemcheck_slab_free(cachep, objp, cachep->object_size);

	/*
	 * Skip calling cache_free_alien() when the platform is not numa.
	 * This will avoid cache misses that happen while accessing slabp (which
	 * is per page memory  reference) to get nodeid. Instead use a global
	 * variable to skip the call, which is mostly likely to be present in
	 * the cache.
	 */
	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
		return;

	if (ac->avail < ac->limit) { /* below the limit: the object goes straight back into the local cache */
		STATS_INC_FREEHIT(cachep);
	} else {
		STATS_INC_FREEMISS(cachep); /* limit reached: first flush batchcount objects to the shared cache / slab lists */
		cache_flusharray(cachep, ac);
	}

	/* store the object in the local CPU cache's entry[] array */
	ac_put_obj(cachep, ac, objp);
}
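
The free path therefore reduces to: if the local entry[] still has room, push the pointer LIFO-style; otherwise flush one batch first and then push. Below is a small userspace sketch of just that decision, with illustrative limits; flush() here simply drops the oldest batch instead of handing it to a node (see the fuller sketch after cache_flusharray()).

#include <stdio.h>
#include <string.h>

#define LIMIT      8     /* ac->limit      (illustrative) */
#define BATCHCOUNT 4     /* ac->batchcount (illustrative) */

struct array_cache_sim {
	unsigned int avail;
	void *entry[LIMIT];
};

/* stands in for cache_flusharray(): drop the oldest BATCHCOUNT pointers
 * (the real code hands them to the node) and slide the survivors down */
static void flush(struct array_cache_sim *ac)
{
	ac->avail -= BATCHCOUNT;
	memmove(ac->entry, &ac->entry[BATCHCOUNT], sizeof(void *) * ac->avail);
}

/* stands in for __cache_free(): flush one batch if full, then push LIFO */
static void cache_free_sim(struct array_cache_sim *ac, void *objp)
{
	if (ac->avail >= LIMIT)           /* FREEMISS path */
		flush(ac);
	ac->entry[ac->avail++] = objp;    /* FREEHIT path / ac_put_obj() */
}

int main(void)
{
	static char objs[12];
	struct array_cache_sim ac = { 0 };

	for (int i = 0; i < 12; i++)
		cache_free_sim(&ac, &objs[i]);

	printf("avail=%u, most recently freed=%p\n",
	       ac.avail, ac.entry[ac.avail - 1]);
	return 0;
}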

static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
	int batchcount;
	struct kmem_cache_node *n;
	int node = numa_mem_id();
	LIST_HEAD(list);

	batchcount = ac->batchcount;
#if DEBUG
	BUG_ON(!batchcount || batchcount > ac->avail);
#endif
	check_irq_off();
	/* per-node slab state (struct kmem_cache_node) */
	n = get_node(cachep, node);
	spin_lock(&n->list_lock);
	if (n->shared) {
		struct array_cache *shared_array = n->shared;
		int max = shared_array->limit - shared_array->avail;
		if (max) { /* shared cache still has room (avail < limit): hand objects to it */
			if (batchcount > max)
				batchcount = max;
			/* copy objects from the local CPU cache into the node's shared cache */
			memcpy(&(shared_array->entry[shared_array->avail]),ac->entry, sizeof(void *) * batchcount);
			shared_array->avail += batchcount;
			goto free_done;
		}
	}

	/* shared cache is full (avail == limit): release batchcount objects from the local cache back to their slab pages */
	free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
#if STATS
	{
		int i = 0;
		struct list_head *p;

		p = n->slabs_free.next;
		while (p != &(n->slabs_free)) {
			struct page *page;

			page = list_entry(p, struct page, lru);
			BUG_ON(page->active);

			i++;
			p = p->next;
		}
		STATS_SET_FREEABLE(cachep, i);
	}
#endif
	spin_unlock(&n->list_lock);
	/* slab pages that no longer hold any active object were collected on 'list'; release them back to the buddy system */
	slabs_destroy(cachep, &list);
	ac->avail -= batchcount;
	/* close the gap left by the objects that were flushed out of entry[] */
	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
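
The interesting part is the hand-off to the node's shared cache: at most (shared->limit - shared->avail) objects are copied there, anything left in the batch goes through free_block(), and the surviving local objects are slid down with memmove(). A hypothetical, self-contained rendering of that bookkeeping (all sizes are illustrative):

#include <stdio.h>
#include <string.h>

#define LOCAL_LIMIT   8
#define SHARED_LIMIT  6
#define BATCHCOUNT    4

struct cache_sim {
	unsigned int avail;
	unsigned int limit;
	void *entry[LOCAL_LIMIT];
};

/* mirrors the shape of cache_flusharray(): shared cache first, the rest "freed" */
static void flusharray_sim(struct cache_sim *ac, struct cache_sim *shared)
{
	unsigned int batchcount = BATCHCOUNT;
	unsigned int max = shared->limit - shared->avail;   /* room left in shared */

	if (max) {
		if (batchcount > max)
			batchcount = max;
		memcpy(&shared->entry[shared->avail], ac->entry,
		       sizeof(void *) * batchcount);          /* oldest objects move over */
		shared->avail += batchcount;
	} else {
		/* shared cache full: free_block() would return the batch to its slabs */
	}

	ac->avail -= batchcount;
	/* close the gap left by the flushed batch */
	memmove(ac->entry, &ac->entry[batchcount], sizeof(void *) * ac->avail);
}

int main(void)
{
	static char objs[8];
	struct cache_sim ac = { .limit = LOCAL_LIMIT };
	struct cache_sim shared = { .limit = SHARED_LIMIT };

	for (int i = 0; i < 8; i++)
		ac.entry[ac.avail++] = &objs[i];

	flusharray_sim(&ac, &shared);
	printf("local avail=%u shared avail=%u\n", ac.avail, shared.avail);
	return 0;
}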


/*
 * Caller needs to acquire correct kmem_cache_node's list_lock
 * @list: List of detached free slabs should be freed by caller
 */
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node, struct list_head *list)
{
	int i;
	struct kmem_cache_node *n = get_node(cachep, node);

	for (i = 0; i < nr_objects; i++) {
		void *objp;
		struct page *page;

		clear_obj_pfmemalloc(&objpp[i]);
		objp = objpp[i];

		/* head page of the slab this object belongs to */
		page = virt_to_head_page(objp);
		list_del(&page->lru); /* unlink the page from whichever slab list it is on */
		check_spinlock_acquired_node(cachep, node);
		slab_put_obj(cachep, page, objp, node);
		STATS_DEC_ACTIVE(cachep);
		n->free_objects++;

		/* fixup slab chains */
		if (page->active == 0) { /* no active objects left: the whole slab page can be reclaimed */
			if (n->free_objects > n->free_limit) {
				n->free_objects -= cachep->num;
				list_add_tail(&page->lru, list); /* detach onto 'list' for the caller to destroy */
			} else {
				list_add(&page->lru, &n->slabs_free);
			}
		} else {
			/* Unconditionally move a slab to the end of the
			 * partial list on free - maximum time for the
			 * other objects to be freed, too.
			 */
			list_add_tail(&page->lru, &n->slabs_partial);
		}
	}
}
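
The list bookkeeping at the end of free_block() boils down to a three-way choice per slab page. The helper below is purely illustrative (made-up enum and parameter names) and mirrors that decision:

#include <stdio.h>

/* hypothetical destinations, mirroring the tail of free_block() */
enum slab_dest { DESTROY_SLAB, SLABS_FREE, SLABS_PARTIAL };

static enum slab_dest classify_slab(unsigned int active,
				    unsigned long free_objects,
				    unsigned long free_limit)
{
	if (active == 0) {
		/* slab is now completely free: either keep it cached on
		 * slabs_free, or detach it for destruction when the node
		 * already holds more free objects than free_limit allows */
		return free_objects > free_limit ? DESTROY_SLAB : SLABS_FREE;
	}
	/* still has live objects: move to the tail of slabs_partial */
	return SLABS_PARTIAL;
}

int main(void)
{
	printf("%d %d %d\n",
	       classify_slab(0, 200, 100),   /* DESTROY_SLAB  (0) */
	       classify_slab(0,  50, 100),   /* SLABS_FREE    (1) */
	       classify_slab(3,  50, 100));  /* SLABS_PARTIAL (2) */
	return 0;
}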

static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
	struct page *page, *n;

	list_for_each_entry_safe(page, n, list, lru) {
		list_del(&page->lru); /* remove the page from the detach list */
		slab_destroy(cachep, page); /* return its physical pages to the buddy system */
	}
}

/**
 * slab_destroy - destroy and release all objects in a slab
 * @cachep: cache pointer being destroyed
 * @page: page pointer being destroyed
 *
 * Destroy all the objs in a slab page, and release the mem back to the system.
 * Before calling the slab page must have been unlinked from the cache. The
 * kmem_cache_node ->list_lock is not held/needed.
 */
static void slab_destroy(struct kmem_cache *cachep, struct page *page)
{
	void *freelist;

	freelist = page->freelist;
	slab_destroy_debugcheck(cachep, page);
	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
		struct rcu_head *head;

		/*
		 * RCU free overloads the RCU head over the LRU.
		 * slab_page has been overloaded over the LRU,
		 * however it is not used from now on so that
		 * we can use it safely.
		 */
		head = (void *)&page->rcu_head;
		call_rcu(head, kmem_rcu_free);

	} else { /* release the physical pages via the buddy system */
		kmem_freepages(cachep, page);
	}

	/*
	 * From now on, we don't use freelist
	 * although actual page can be freed in rcu context
	 */
	if (OFF_SLAB(cachep)) /* for off-slab management, free the separately allocated freelist */
		kmem_cache_free(cachep->freelist_cache, freelist);
}

Note that objects are initially placed into the local CPU object cache in batches of batchcount, in order of increasing virtual address. After countless allocation and free operations in between, the objects held in the local CPU cache's entry[] array end up with their virtual addresses in no particular order: a freed object is simply stored at entry[avail++], so no address ordering can be assumed at that point.
