http://blog.jobbole.com/91820/
This post covers only physical memory management in the Linux kernel. I have read a lot of material on memory management, but whenever I try to summarize it I never know where to start, so let's start from the actual physical memory allocation interfaces.
kmalloc() allocates physically contiguous memory and does not zero it. How much can it allocate? The upper bound is KMALLOC_MAX_SIZE, the largest general-purpose size class (visible in kmalloc_sizes.h below). It hands out low memory: it never returns pages from ZONE_HIGHMEM, it normally allocates from ZONE_NORMAL, and ZONE_DMA is used only when the GFP_DMA flag is passed. In the common configuration physical memory is split into three zones: DMA, NORMAL and HIGHMEM.
kmalloc() is built on top of the slab allocator: it picks a kmem_cache, and the cache in turn hands out objects from its slabs.
Before analysing kmalloc() itself, a quick overview of the kernel's physical memory allocation APIs:
__get_free_pages() calls alloc_pages() underneath; it allocates 2^order physically contiguous pages and returns a kernel virtual address, so it cannot hand out HIGHMEM pages. Its simplified form __get_free_page() allocates a single page,
while get_zeroed_page() additionally fills the returned page with zeros. For ZONE_DMA there is __get_dma_pages(), which is again a wrapper around __get_free_pages().
The most general interface is alloc_pages(): given the right GFP flags it can allocate from any zone, and it returns a struct page * rather than a virtual address.
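To make the difference between these interfaces concrete, here is a minimal kernel-module sketch of how they are typically called. It is not taken from this post; the sizes, flags and error handling are illustrative only.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/gfp.h>

static void *buf;
static unsigned long page_addr;
static struct page *pg;

static int __init alloc_demo_init(void)
{
    /* physically contiguous, low memory, contents not zeroed */
    buf = kmalloc(128, GFP_KERNEL);

    /* one page (order 0), returned as a kernel virtual address */
    page_addr = __get_free_pages(GFP_KERNEL, 0);

    /* alloc_pages() can reach any zone; a __GFP_HIGHMEM page has no
     * permanent kernel mapping and would need kmap() before use */
    pg = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);

    if (!buf || !page_addr || !pg) {
        kfree(buf);               /* kfree(NULL) is a no-op */
        free_pages(page_addr, 0); /* free_pages(0, ...) is a no-op */
        if (pg)
            __free_pages(pg, 0);
        return -ENOMEM;
    }
    return 0;
}

static void __exit alloc_demo_exit(void)
{
    kfree(buf);
    free_pages(page_addr, 0);
    __free_pages(pg, 0);
}

module_init(alloc_demo_init);
module_exit(alloc_demo_exit);
MODULE_LICENSE("GPL");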
Reference kernel: Linux 3.18.13
Reference books: "Linux Kernel Development", "Linux Device Drivers", 《深入理解Linux设备驱动内核机制》
Now let's look at kmalloc() itself (the allocation flags are not discussed here; see the references for details).
First, the header that callers use:
#include <linux/slab.h>
As for the concrete implementation, slab.h selects it as follows:
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
#include <linux/slab_def.h>
#endif
Here we take the default case of the classic SLAB allocator, i.e. #include <linux/slab_def.h>, which is the implementation analysed below (you can check which of CONFIG_SLAB / CONFIG_SLUB / CONFIG_SLOB your kernel sets in its config file):
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
    struct kmem_cache *cachep;
    void *ret;

    if (__builtin_constant_p(size)) {
        int i = 0;

        if (!size)
            return ZERO_SIZE_PTR;

#define CACHE(x) \
        if (size <= x) \
            goto found; \
        else \
            i++;
#include <linux/kmalloc_sizes.h> /* looks up which size class the request falls into:
                                  * classes start at 32 and grow roughly by powers of
                                  * two, and i is incremented once per entry */
#undef CACHE
        return NULL;
found:
#ifdef CONFIG_ZONE_DMA
        if (flags & GFP_DMA)
            cachep = malloc_sizes[i].cs_dmacachep;
            /* if ZONE_DMA is configured and GFP_DMA was passed, allocate from the
             * per-size DMA cache; malloc_sizes[] is initialised in slab.c and is
             * worth studying in detail */
        else
#endif
            cachep = malloc_sizes[i].cs_cachep;
            /* otherwise allocate from the general cache of the matching size
             * class, so little space is wasted */
        ret = kmem_cache_alloc_trace(cachep, flags, size);

        return ret;
    }
    return __kmalloc(size, flags);
}
For reference, here is kmalloc_sizes.h, the table of size classes:
#if (PAGE_SIZE == 4096)
CACHE(32)
#endif
CACHE(64)
#if L1_CACHE_BYTES < 64
CACHE(96)
#endif
CACHE(128)
#if L1_CACHE_BYTES < 128
CACHE(192)
#endif
CACHE(256)
CACHE(512)
CACHE(1024)
CACHE(2048)
CACHE(4096)
CACHE(8192)
CACHE(16384)
CACHE(32768)
CACHE(65536)
CACHE(131072)
#if KMALLOC_MAX_SIZE >= 262144
CACHE(262144)
#endif
#if KMALLOC_MAX_SIZE >= 524288
CACHE(524288)
#endif
#if KMALLOC_MAX_SIZE >= 1048576
CACHE(1048576)
#endif
#if KMALLOC_MAX_SIZE >= 2097152
CACHE(2097152)
#endif
#if KMALLOC_MAX_SIZE >= 4194304
CACHE(4194304)
#endif
#if KMALLOC_MAX_SIZE >= 8388608
CACHE(8388608)
#endif
#if KMALLOC_MAX_SIZE >= 16777216
CACHE(16777216)
#endif
#if KMALLOC_MAX_SIZE >= 33554432
CACHE(33554432)
#endif
A note on the start of the function:
__builtin_constant_p() is a GCC built-in that reports whether a value is a compile-time constant: it returns 1 for a constant and 0 otherwise. Its typical use is exactly this kind of manual compile-time optimisation inside a macro or always-inline function. So when size is a compile-time constant, kmalloc() resolves the size class at compile time and allocates directly from the matching per-size cache; only when size is not a constant does it fall back to __kmalloc(size, flags).
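Here is a small user-space sketch (not kernel code; the size table is a made-up subset of kmalloc_sizes.h) showing both ideas, __builtin_constant_p() and the CACHE() size-class walk:

#include <stdio.h>
#include <stdlib.h>

/* Walk a toy size table the same way kmalloc() walks kmalloc_sizes.h:
 * return the index of the first class that fits, or -1 if none does. */
static int pick_size_class(size_t size)
{
    int i = 0;
#define CACHE(x) if (size <= x) return i; else i++;
    CACHE(32) CACHE(64) CACHE(128) CACHE(256) CACHE(512) CACHE(1024)
#undef CACHE
    return -1;
}

int main(void)
{
    /* a literal is a compile-time constant: this mirrors the kmalloc() fast path */
    printf("constant? %d, class index for 100 bytes: %d\n",
           __builtin_constant_p(100), pick_size_class(100));

    /* a runtime value is not a constant: this corresponds to the __kmalloc() fallback */
    size_t n = (size_t)rand() % 1000;
    printf("constant? %d, class index for %zu bytes: %d\n",
           __builtin_constant_p(n), n, pick_size_class(n));
    return 0;
}

With gcc, the first call is folded entirely at compile time, which is exactly what the __always_inline kmalloc() fast path relies on (a 100-byte request lands in the 128-byte class); the second call corresponds to the __kmalloc() slow path.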
Back in kmalloc(): having found the index of the general cache that fits the requested size, it calls:
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
#else
static __always_inline void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
    return kmem_cache_alloc(cachep, flags);
}
#endif
Here is the code:
/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *ret = slab_alloc(cachep, flags, _RET_IP_);

    trace_kmem_cache_alloc(_RET_IP_, ret,   /* only used for tracing/debugging */
                           cachep->object_size, cachep->size, flags);

    return ret;
}
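kmem_cache_alloc() is also an exported interface that drivers call directly when they allocate many objects of the same size. A minimal sketch of that usage pattern (the cache name and object type are made up for illustration):

#include <linux/module.h>
#include <linux/slab.h>

/* hypothetical object type, just for illustration */
struct my_obj {
    int id;
    char name[32];
};

static struct kmem_cache *my_cache;
static struct my_obj *obj;

static int __init cache_demo_init(void)
{
    /* create a dedicated cache; every object is sizeof(struct my_obj) */
    my_cache = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
                                 0, SLAB_HWCACHE_ALIGN, NULL);
    if (!my_cache)
        return -ENOMEM;

    /* the same path kmalloc() takes, minus the size-class lookup */
    obj = kmem_cache_alloc(my_cache, GFP_KERNEL);
    if (!obj) {
        kmem_cache_destroy(my_cache);
        return -ENOMEM;
    }
    return 0;
}

static void __exit cache_demo_exit(void)
{
    kmem_cache_free(my_cache, obj);
    kmem_cache_destroy(my_cache);
}

module_init(cache_demo_init);
module_exit(cache_demo_exit);
MODULE_LICENSE("GPL");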
The actual allocation work is done by slab_alloc():
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
    unsigned long save_flags;
    void *objp;

    flags &= gfp_allowed_mask;
    /* gfp_allowed_mask is declared in gfp.h:
     *
     *   gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
     *   GFP flags are used before interrupts are enabled. Once interrupts are
     *   enabled, it is set to __GFP_BITS_MASK while the system is running. During
     *   hibernation, it is used by PM to avoid I/O during memory allocation while
     *   devices are suspended.
     *
     *   extern gfp_t gfp_allowed_mask;
     */
    lockdep_trace_alloc(flags);  /* debugging only */

    if (slab_should_failslab(cachep, flags))
        return NULL;

    cachep = memcg_kmem_get_cache(cachep, flags);

    cache_alloc_debugcheck_before(cachep, flags);
    local_irq_save(save_flags);
    objp = __do_cache_alloc(cachep, flags);
    local_irq_restore(save_flags);
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
    kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
                             flags);
    prefetchw(objp);

    if (likely(objp))
        kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);

    if (unlikely((flags & __GFP_ZERO) && objp))
        memset(objp, 0, cachep->object_size);

    return objp;
}
It calls objp = __do_cache_alloc(cachep, flags);, which, apart from some flag and node checks, goes on to call
____cache_alloc(cachep, flags);
This is the common path for both configurations (there are NUMA and UMA variants of __do_cache_alloc(); Linux behaves as UMA unless NUMA is configured):
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    struct array_cache *ac;
    bool force_refill = false;

    check_irq_off();

    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;
        }
        force_refill = true;
    }

    STATS_INC_ALLOCMISS(cachep);
    objp = cache_alloc_refill(cachep, flags, force_refill);
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);

out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}
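The key structure in ____cache_alloc() is the per-CPU array_cache: ac->entry[] is used as a LIFO stack of pointers to free objects and ac->avail is its depth. Below is a simplified user-space model of that fast path; the types and the limit are stand-ins, and the real ac_get_obj()/cache_alloc_refill() do considerably more work:

#include <stdio.h>
#include <stddef.h>

#define AC_LIMIT 8

/* simplified stand-in for struct array_cache */
struct array_cache_model {
    unsigned int avail;        /* how many cached objects are ready */
    void *entry[AC_LIMIT];     /* LIFO stack of free objects */
};

static void *fast_alloc(struct array_cache_model *ac)
{
    if (ac->avail)                       /* hit: pop the most recently freed object */
        return ac->entry[--ac->avail];
    return NULL;                         /* miss: the kernel calls cache_alloc_refill() here */
}

static void fast_free(struct array_cache_model *ac, void *obj)
{
    if (ac->avail < AC_LIMIT)            /* push back onto the per-CPU stack */
        ac->entry[ac->avail++] = obj;
}

int main(void)
{
    static int objs[2];
    struct array_cache_model ac = { 0 };

    fast_free(&ac, &objs[0]);
    fast_free(&ac, &objs[1]);
    printf("alloc -> %p (last freed comes back first)\n", fast_alloc(&ac));
    printf("alloc -> %p\n", fast_alloc(&ac));
    printf("alloc -> %p (empty: a refill would be needed)\n", fast_alloc(&ac));
    return 0;
}

The LIFO order is deliberate: the most recently freed object is the one most likely to still be hot in the CPU cache.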
Now assume this is the very first allocation from this cache. Given how malloc_sizes[] is initialised in kmem_cache_init(), the kmalloc cache selected above was set up by the following function:
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    if (slab_state >= FULL)
        return enable_cpucache(cachep, gfp);

    if (slab_state == DOWN) {
        /*
         * Note: Creation of first cache (kmem_cache).
         * The setup_list3s is taken care
         * of by the caller of __kmem_cache_create
         */
        cachep->array[smp_processor_id()] = &initarray_generic.cache;
        slab_state = PARTIAL;
    } else if (slab_state == PARTIAL) {
        /*
         * Note: the second kmem_cache_create must create the cache
         * that's used by kmalloc(24), otherwise the creation of
         * further caches will BUG().
         */
        cachep->array[smp_processor_id()] = &initarray_generic.cache;

        /*
         * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
         * the second cache, then we need to set up all its list3s,
         * otherwise the creation of further caches will BUG().
         */
        set_up_list3s(cachep, SIZE_AC);
        if (INDEX_AC == INDEX_L3)
            slab_state = PARTIAL_L3;
        else
            slab_state = PARTIAL_ARRAYCACHE;
    } else {
        /* Remaining boot caches */
        cachep->array[smp_processor_id()] =
            kmalloc(sizeof(struct arraycache_init), gfp);

        if (slab_state == PARTIAL_ARRAYCACHE) {
            set_up_list3s(cachep, SIZE_L3);
            slab_state = PARTIAL_L3;
        } else {
            int node;
            for_each_online_node(node) {
                cachep->nodelists[node] =
                    kmalloc_node(sizeof(struct kmem_list3),
                                 gfp, node);
                BUG_ON(!cachep->nodelists[node]);
                kmem_list3_init(cachep->nodelists[node]);
            }
        }
    }
    cachep->nodelists[numa_mem_id()]->next_reap =
            jiffies + REAPTIMEOUT_LIST3 +
            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; /* BOOT_CPUCACHE_ENTRIES is 1 */
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
}
Whichever branch sets up the per-CPU array, the avail and related fields are initialised at the end (avail = 0).
So on this first allocation the array has no available objects and ____cache_alloc() falls into cache_alloc_refill(); had the array held objects, the object pointer would simply have been returned directly.
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                                bool force_refill)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;
    int node;

    check_irq_off();
    node = numa_mem_id();
    if (unlikely(force_refill))
        goto force_grow;
retry:
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = cachep->nodelists[node];

    BUG_ON(ac->avail > 0 || !l3);
    spin_lock(&l3->list_lock);

    /* See if we can refill from the shared array */
    if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
        l3->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
                                                node));
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&l3->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
force_grow:
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
        /* cache_grow() returns 1 on success */

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        node = numa_mem_id();

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail)      /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;

    return ac_get_obj(cachep, ac, flags, force_refill);
}
Since this is the first use, both slab lists on the nodelist are empty, so we end up at must_grow.
That calls cache_grow(), which first handles slab colouring and then calls kmem_getpages() to allocate pages (the order comes from cachep->gfporder, i.e. 2^gfporder contiguous pages) and returns the virtual address of the allocated pages.
/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow(struct kmem_cache *cachep,
                      gfp_t flags, int nodeid, void *objp)
{
    struct slab *slabp;
    size_t offset;
    gfp_t local_flags;
    struct kmem_list3 *l3;

    /*
     * Be lazy and only check for valid flags here,  keeping it out of the
     * critical path in kmem_cache_alloc().
     */
    BUG_ON(flags & GFP_SLAB_BUG_MASK);
    local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

    /* Take the l3 list lock to change the colour_next on this node */
    check_irq_off();
    l3 = cachep->nodelists[nodeid];
    spin_lock(&l3->list_lock);

    /* Get colour for the slab, and cal the next value. */
    offset = l3->colour_next;    /* defaults to 0 */
    l3->colour_next++;
    if (l3->colour_next >= cachep->colour)
        l3->colour_next = 0;
    spin_unlock(&l3->list_lock);

    offset *= cachep->colour_off;    /* the first time around, offset is 0 */

    if (local_flags & __GFP_WAIT)
        local_irq_enable();

    /*
     * The test for missing atomic flag is performed here, rather than
     * the more obvious place, simply to reduce the critical path length
     * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
     * will eventually be caught here (where it matters).
     */
    kmem_flagcheck(cachep, flags);

    /*
     * Get mem for the objs.  Attempt to allocate a physical page from
     * 'nodeid'.
     */
    if (!objp)
        objp = kmem_getpages(cachep, local_flags, nodeid);
    if (!objp)
        goto failed;

    /* Get slab management. */
    slabp = alloc_slabmgmt(cachep, objp, offset,
                           local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
    if (!slabp)
        goto opps1;

    slab_map_pages(cachep, slabp, objp);

    cache_init_objs(cachep, slabp);

    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    check_irq_off();
    spin_lock(&l3->list_lock);

    /* Make slab active. */
    list_add_tail(&slabp->list, &(l3->slabs_free));
    /* the newly allocated slab is added to the nodelist's slabs_free list */
    STATS_INC_GROWN(cachep);
    l3->free_objects += cachep->num;
    /* bump the free-object count by the number of objects each slab holds */
    spin_unlock(&l3->list_lock);
    return 1;
opps1:
    kmem_freepages(cachep, objp);
failed:
    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    return 0;
}
Slab colouring is about the hardware cache: by giving consecutive slabs different starting offsets, objects in different slabs are less likely to compete for the same cache lines, which avoids some conflict misses (see "Computer Systems: A Programmer's Perspective" for the cache-line background).
The concrete handling is in alloc_slabmgmt() below; a small worked sketch of the resulting offsets follows the function.
/*
 * Get the memory for a slab management obj.
 * For a slab cache when the slab descriptor is off-slab, slab descriptors
 * always come from malloc_sizes caches.  The slab descriptor cannot
 * come from the same cache which is getting created because,
 * when we are searching for an appropriate cache for these
 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
 * If we are creating a malloc_sizes cache here it would not be visible to
 * kmem_find_general_cachep till the initialization is complete.
 * Hence we cannot have slabp_cache same as the original cache.
 */
static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
                                   int colour_off, gfp_t local_flags,
                                   int nodeid)
{
    struct slab *slabp;

    if (OFF_SLAB(cachep)) {
        /*
         * Whether a cache gets CFLGS_OFF_SLAB is decided in
         * __kmem_cache_create():
         *
         *   // Determine if the slab management is 'on' or 'off' slab.
         *   // (bootstrapping cannot cope with offslab caches so don't do
         *   // it too early on. Always use on-slab management when
         *   // SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
         *   if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
         *       !(flags & SLAB_NOLEAKTRACE))
         *       // Size is large, assume best to place the slab management obj
         *       // off-slab (should allow better packing of objs).
         *       flags |= CFLGS_OFF_SLAB;
         */

        /* Slab management obj is off-slab. */
        slabp = kmem_cache_alloc_node(cachep->slabp_cache,
                                      local_flags, nodeid);
        /*
         * If the first object in the slab is leaked (it's allocated
         * but no one has a reference to it), we want to make sure
         * kmemleak does not treat the ->s_mem pointer as a reference
         * to the object. Otherwise we will not report the leak.
         */
        kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
                           local_flags);
        if (!slabp)
            return NULL;
    } else {
        slabp = objp + colour_off;
        /* __kmem_cache_create() sets cachep->colour_off = cache_line_size();
         * cache.h defines cache_line_size() as L1_CACHE_BYTES (commonly 32 or
         * 64 bytes), and cachep->colour = left_over / cachep->colour_off */
        colour_off += cachep->slab_size;
    }
    slabp->inuse = 0;                     /* number of objs active in this slab */
    slabp->colouroff = colour_off;        /* offset of the first obj from the start of the page */
    slabp->s_mem = objp + colour_off;     /* address of the first obj */
    slabp->nodeid = nodeid;
    slabp->free = 0;
    return slabp;
}
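To put numbers on the colouring, here is a tiny user-space sketch. The values (64-byte cache lines, 192 left-over bytes per slab) are assumptions chosen for illustration, not taken from the post:

#include <stdio.h>

int main(void)
{
    unsigned int colour_off = 64;            /* cache_line_size(), assumed 64 bytes */
    unsigned int colour = 192 / colour_off;  /* cachep->colour = left_over / colour_off = 3 */
    unsigned int colour_next = 0;            /* l3->colour_next */
    int slab;

    /* mirrors the offset computation in cache_grow() */
    for (slab = 0; slab < 6; slab++) {
        unsigned int offset = colour_next * colour_off;
        printf("slab %d: first object at offset %u\n", slab, offset);
        if (++colour_next >= colour)
            colour_next = 0;
    }
    return 0;
}

Successive slabs place their first object at offsets 0, 64, 128, then wrap back to 0, so objects at the same index in different slabs map to different cache lines.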
Let's look at another very important operation, initialising the objects of the new slab:
static void cache_init_objs(struct kmem_cache *cachep,
                            struct slab *slabp)
{
    int i;

    for (i = 0; i < cachep->num; i++) {
        void *objp = index_to_obj(cachep, slabp, i);
#if DEBUG
        /* need to poison the objs? */
        if (cachep->flags & SLAB_POISON)
            poison_obj(cachep, objp, POISON_FREE);
        if (cachep->flags & SLAB_STORE_USER)
            *dbg_userword(cachep, objp) = NULL;

        if (cachep->flags & SLAB_RED_ZONE) {
            *dbg_redzone1(cachep, objp) = RED_INACTIVE;
            *dbg_redzone2(cachep, objp) = RED_INACTIVE;
        }
        /*
         * Constructors are not allowed to allocate memory from the same
         * cache which they are a constructor for.  Otherwise, deadlock.
         * They must also be threaded.
         */
        if (cachep->ctor && !(cachep->flags & SLAB_POISON))
            cachep->ctor(objp + obj_offset(cachep));

        if (cachep->flags & SLAB_RED_ZONE) {
            if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
                slab_error(cachep, "constructor overwrote the"
                           " end of an object");
            if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
                slab_error(cachep, "constructor overwrote the"
                           " start of an object");
        }
        if ((cachep->size % PAGE_SIZE) == 0 &&
                OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
            kernel_map_pages(virt_to_page(objp),
                             cachep->size / PAGE_SIZE, 0);
#else
        if (cachep->ctor)
            cachep->ctor(objp);    /* initialise the object with its constructor */
#endif
        slab_bufctl(slabp)[i] = i + 1;
        /* initialise the bufctl array to 1, 2, 3, 4, ...; the last entry is set
         * to BUFCTL_END below */
    }
    slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
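To close, here is a simplified user-space model of the bufctl free list that cache_init_objs() sets up: entry i holds the index of the next free object and the last entry is BUFCTL_END, so allocation can pop free objects by following slabp->free (as slab_get_obj() does). The types, names and limits here are simplified stand-ins, not the kernel's:

#include <stdio.h>

#define NUM        4
#define BUFCTL_END ((unsigned int)~0U)

static unsigned int bufctl[NUM];     /* models slab_bufctl(slabp) */
static unsigned int free_idx;        /* models slabp->free */

static void init_objs(void)
{
    int i;
    for (i = 0; i < NUM; i++)        /* 1, 2, 3, ... as in cache_init_objs() */
        bufctl[i] = i + 1;
    bufctl[NUM - 1] = BUFCTL_END;    /* terminate the free list */
    free_idx = 0;
}

static int get_obj(void)
{
    unsigned int obj = free_idx;
    if (obj == BUFCTL_END)
        return -1;                   /* no free objects left in this slab */
    free_idx = bufctl[obj];          /* advance to the next free index */
    return (int)obj;
}

int main(void)
{
    int i;
    init_objs();
    for (i = 0; i < NUM + 1; i++)
        printf("allocated object index: %d\n", get_obj());
    return 0;
}

Running it prints object indices 0, 1, 2, 3 and then -1 once the slab is exhausted, which is the point where the real allocator moves the slab to the slabs_full list and, if more objects are needed, grows the cache again.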