Memory Manager (17) Kernel memory management -- slab design and implementation (allocating an object)
Preface
The previous article covered slab initialization and the creation of a cache. This one analyzes how the slab allocator hands out an object.
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) is the function we will analyze.
[c]
/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc(). (Not a very helpful comment, but no matter -- the
 *         flags were already analyzed in the earlier article on the data
 *         structures; go read that one if you haven't.)
 *
 * Allocate an object from this cache. The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *ret = slab_alloc(cachep, flags, _RET_IP_);

	trace_kmem_cache_alloc(_RET_IP_, ret,
			       cachep->object_size, cachep->size, flags);
	/* a trace point, used for debugging */
	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);
[/c]
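Before diving into the internals, here is a minimal sketch of how a driver typically uses this API. The cache name and the struct are made up for illustration; only kmem_cache_create()/kmem_cache_alloc()/kmem_cache_free() are the real kernel interfaces.
[c]
#include <linux/slab.h>
#include <linux/module.h>

/* Hypothetical object type, used only for this example. */
struct my_record {
	int id;
	char payload[56];
};

static struct kmem_cache *my_cachep;

static int __init my_init(void)
{
	/* create a cache of fixed-size objects ... */
	my_cachep = kmem_cache_create("my_record", sizeof(struct my_record),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!my_cachep)
		return -ENOMEM;
	return 0;
}

static struct my_record *my_record_new(void)
{
	/* ... and allocate one object from it; may sleep with GFP_KERNEL */
	return kmem_cache_alloc(my_cachep, GFP_KERNEL);
}

static void my_record_free(struct my_record *r)
{
	kmem_cache_free(my_cachep, r);
}
[/c]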
[c]
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
	unsigned long save_flags;
	void *objp;			/* the object we will return */

	flags &= gfp_allowed_mask;
	lockdep_trace_alloc(flags);	/* a trace hook */
	if (slab_should_failslab(cachep, flags))	/* fault injection: may deliberately fail this allocation */
		return NULL;
	cachep = memcg_kmem_get_cache(cachep, flags);	/* see the first function analyzed below */

	cache_alloc_debugcheck_before(cachep, flags);	/* two jobs: 1. might_sleep() check when __GFP_WAIT is set;
							   2. extra flag checking when DEBUG is defined */
	local_irq_save(save_flags);		/* save and disable local interrupts */
	objp = __do_cache_alloc(cachep, flags);	/* the real allocation work, analyzed below */
	local_irq_restore(save_flags);		/* restore local interrupts */
	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);	/* DEBUG */
	kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, flags);
	prefetchw(objp);

	if (likely(objp)) {
		kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
		if (unlikely(flags & __GFP_ZERO))
			memset(objp, 0, cachep->object_size);
	}

	memcg_kmem_put_cache(cachep);
	return objp;
}
[/c]
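Note the __GFP_ZERO branch at the end: if the caller asks for zeroed memory, the freshly allocated object is memset() to zero. A quick, hedged sketch of what that means in practice (my_cachep is the hypothetical cache from the earlier example; kmem_cache_zalloc() is the kernel's thin wrapper that ORs in __GFP_ZERO):
[c]
/* Sketch only: both calls below return a zero-filled object. */
static void *alloc_zeroed_record(void)
{
	void *a = kmem_cache_alloc(my_cachep, GFP_KERNEL | __GFP_ZERO);
	void *b = kmem_cache_zalloc(my_cachep, GFP_KERNEL);	/* equivalent */

	kmem_cache_free(my_cachep, b);	/* keep only one of them for the caller */
	return a;
}
[/c]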
Analysis of the individual functions
1. memcg_kmem_get_cache()
[c]
/**
 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
 * @cachep: the original, global kmem cache (a node on the global cache list,
 *          a circular doubly-linked list, so any node can reach any other)
 * @gfp: allocation flags
 *
 * All memory allocated from a per-memcg cache is charged to the owner memcg.
 */
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (!memcg_kmem_enabled())	/* in any of the following four cases, return the original cache directly */
		return cachep;
	if (gfp & __GFP_NOFAIL)
		return cachep;
	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
		return cachep;
	if (unlikely(fatal_signal_pending(current)))
		return cachep;

	return __memcg_kmem_get_cache(cachep);
}
[/c]
If none of the bail-out cases above applies, we fall through into the function called in the final return:
[c]
mm/memcontrol.c
/*
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, if we are the first user of it,
 * we either create it immediately, if possible, or create it asynchronously
 * in a workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held. (RCU read
 * locks were introduced in 2.6 and target workloads with many readers and
 * few writers.)
 */
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
	struct mem_cgroup *memcg;	/* see the mem_cgroup data structure described below */
	struct kmem_cache *memcg_cachep;
	int kmemcg_id;

	VM_BUG_ON(!is_root_cache(cachep));	/* sanity check */

	if (current->memcg_kmem_skip_account)
		return cachep;

	memcg = get_mem_cgroup_from_mm(current->mm);	/* get the memcg of the current task */
	kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
	/*
	 * Another interesting macro:
	 *   #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
	 * ACCESS_ONCE reads the variable exactly once. Because it goes
	 * through a volatile-qualified pointer, every access reloads the
	 * value from memory instead of reusing a cached copy. It is purely a
	 * compiler-level construct and does nothing extra at the CPU level;
	 * its main job is to keep values consistent between process-context
	 * code and IRQ code.
	 */
	if (kmemcg_id < 0)
		goto out;

	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);	/* analyzed below */
	if (likely(memcg_cachep))
		return memcg_cachep;

	/*
	 * If we are in a safe context (can wait, and not in interrupt
	 * context), we could be predictable and return right away.
	 * This would guarantee that the allocation being performed
	 * already belongs in the new cache.
	 *
	 * However, there are some clashes that can arrive from locking.
	 * For instance, because we acquire the slab_mutex while doing
	 * memcg_create_kmem_cache, this means no further allocation
	 * could happen with the slab_mutex held. So it's better to
	 * defer everything.
	 */
	memcg_schedule_kmem_cache_create(memcg, cachep);	/* analyzed below */
out:
	css_put(&memcg->css);	/* drop the css reference and finish */
	return cachep;
}
[/c]
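To make the ACCESS_ONCE note concrete, here is a minimal, hedged sketch of the behaviour it guarantees. The macro definition matches the kernel headers of that era; the variable and function names are made up, and the snippet is written as plain GCC C for illustration.
[c]
#include <stdio.h>

/* Same definition as in the kernel headers of that era. */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

int shared_flag;	/* imagine an IRQ handler or another thread sets this */

void wait_for_flag(void)
{
	/*
	 * Without ACCESS_ONCE the compiler may read shared_flag once, keep it
	 * in a register and turn this into an infinite loop. The volatile
	 * cast forces a fresh load from memory on every iteration.
	 */
	while (!ACCESS_ONCE(shared_flag))
		;
	printf("flag observed\n");
}
[/c]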
The mem_cgroup data structure
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
*
* TODO: Add a water mark for the memory controller. Reclaim will begin when
* we hit the water mark. May be even add a low water mark, such that
* no reclaim occurs from a cgroup at it's low water mark, this is
* a feature that will be implemented much later in the future.
*/
struct mem_cgroup
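The full structure is very large; below is an abridged, hedged sketch of just the members that the code above touches (everything else is omitted, and the layout is illustrative rather than an exact copy of the kernel definition).
[c]
/* Abridged sketch -- not the full definition, only the members referenced
 * by __memcg_kmem_get_cache() above. */
struct mem_cgroup {
	struct cgroup_subsys_state css;	/* reference-counted via css_get()/css_put() */
	int kmemcg_id;			/* index into each cache's memcg_caches array */
	/* ... many more fields: counters, limits, statistics, LRU info ... */
};
[/c]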
cache_from_memcg_idx(struct kmem_cache *s, int idx)
[c]
/*
 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
 * That said the caller must assure the memcg's cache won't go away by either
 * taking a css reference to the owner cgroup, or holding the slab_mutex.
 * (cgroups themselves will be covered in a separate article.)
 */
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
	struct kmem_cache *cachep;
	struct memcg_cache_array *arr;

	rcu_read_lock();	/* enter the RCU read-side critical section */
	arr = rcu_dereference(s->memcg_params.memcg_caches);	/* fetch the per-memcg cache array */

	/*
	 * Make sure we will access the up-to-date value. The code updating
	 * memcg_caches issues a write barrier to match this (see
	 * memcg_create_kmem_cache()).
	 */
	cachep = lockless_dereference(arr->entries[idx]);
	rcu_read_unlock();

	return cachep;	/* the per-memcg cache for this index (may be NULL) */
}
[/c]
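The rcu_read_lock()/rcu_dereference()/rcu_read_unlock() trio above is the standard RCU read-side pattern. A minimal, hedged sketch of the same pattern with a hypothetical pointer (the struct, variable and function names here are invented for illustration):
[c]
#include <linux/rcupdate.h>

struct config {
	int value;
};

static struct config __rcu *cur_cfg;	/* an updater publishes it with rcu_assign_pointer() */

static int read_config_value(void)
{
	struct config *cfg;
	int val = -1;

	rcu_read_lock();		/* read-side critical section: very cheap */
	cfg = rcu_dereference(cur_cfg);	/* fetch a safe snapshot of the pointer */
	if (cfg)
		val = cfg->value;	/* must be used while still inside the section */
	rcu_read_unlock();		/* after this, the updater may free the old cfg */

	return val;
}
[/c]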
memcg_schedule_kmem_cache_create()
[c]
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					     struct kmem_cache *cachep)
{
	/*
	 * We need to stop accounting when we kmalloc, because if the
	 * corresponding kmalloc cache is not yet created, the first allocation
	 * in __memcg_schedule_kmem_cache_create will recurse.
	 *
	 * However, it is better to enclose the whole function. Depending on
	 * the debugging options enabled, INIT_WORK(), for instance, can
	 * trigger an allocation. This too, will make us recurse. Because at
	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
	 * the safest choice is to do it like this, wrapping the whole function.
	 */
	current->memcg_kmem_skip_account = 1;
	__memcg_schedule_kmem_cache_create(memcg, cachep);	/* see below */
	current->memcg_kmem_skip_account = 0;
}

/*
 * Enqueue the creation of a per-memcg kmem_cache.
 */
static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					       struct kmem_cache *cachep)
{
	struct memcg_kmem_cache_create_work *cw;
	/* the structure being queued:
	 *	struct memcg_kmem_cache_create_work {
	 *		struct mem_cgroup *memcg;
	 *		struct kmem_cache *cachep;
	 *		struct work_struct work;
	 *	};
	 */
	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
	if (!cw)
		return;

	css_get(&memcg->css);

	cw->memcg = memcg;
	cw->cachep = cachep;
	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);

	schedule_work(&cw->work);	/* queue the work item on the system workqueue */
}
[/c]
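INIT_WORK() plus schedule_work() is the standard way to defer work out of a context where you cannot sleep or take certain locks. A minimal, hedged sketch of the same pattern (the wrapper struct, handler and helper names are hypothetical; only the workqueue API itself is real):
[c]
#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_deferred_work {
	struct work_struct work;
	int payload;			/* whatever the handler needs later */
};

static void my_work_handler(struct work_struct *work)
{
	/* container_of() recovers our wrapper from the embedded work_struct */
	struct my_deferred_work *dw =
		container_of(work, struct my_deferred_work, work);

	pr_info("deferred work ran, payload=%d\n", dw->payload);
	kfree(dw);
}

static int my_defer(int value)
{
	struct my_deferred_work *dw = kmalloc(sizeof(*dw), GFP_NOWAIT);

	if (!dw)
		return -ENOMEM;

	dw->payload = value;
	INIT_WORK(&dw->work, my_work_handler);
	schedule_work(&dw->work);	/* runs later in process context */
	return 0;
}
[/c]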
static __always_inline void *__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
This is the heavyweight function that actually allocates the object.
[c]
static __always_inline void *
__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
	void *objp;

	if (current->mempolicy || cpuset_do_slab_mem_spread()) {	/* NUMA policy in effect? */
		objp = alternate_node_alloc(cache, flags);
		if (objp)
			goto out;
		/* this branch only matters on NUMA; otherwise it returns NULL */
	}
	objp = ____cache_alloc(cache, flags);	/* the workhorse allocator, analyzed carefully below */

	/*
	 * We may just have run out of memory on the local node.
	 * ____cache_alloc_node() knows how to locate memory on other nodes
	 */
	if (!objp)
		objp = ____cache_alloc_node(cache, flags, numa_mem_id());

out:
	return objp;
}
[/c]
<mm/slab.c> /* the workhorse allocation function */
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
[c]
/* Note up front: this is the non-NUMA allocation path. */
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp;			/* the object to return */
	struct array_cache *ac;		/* the per-CPU array cache of this kmem_cache */
	bool force_refill = false;	/* set to true when the per-CPU array must be refilled;
					   we do not know yet, so start with false */

	check_irq_off();		/* verify that local interrupts are disabled */

	ac = cpu_cache_get(cachep);	/* get this CPU's array cache; implemented through
					   per-CPU accessor macros that end in assembly */
	if (likely(ac->avail)) {	/* avail = number of free objects currently in the array;
					   likely() marks this as the expected fast path */
		ac->touched = 1;	/* mark the array as recently used */
		objp = ac_get_obj(cachep, ac, flags, false);
		/* essentially objp = ac->entry[--ac->avail]: pop an object from the per-CPU array */

		/*
		 * Allow for the possibility all avail objects are not allowed
		 * by the current flags
		 */
		if (objp) {			/* success: record the hit and return */
			STATS_INC_ALLOCHIT(cachep);	/* statistics: allocation hit */
			goto out;
		}
		force_refill = true;		/* nothing usable: the per-CPU array must be refilled */
	}

	STATS_INC_ALLOCMISS(cachep);		/* statistics: allocation miss; go refill the per-CPU array */
	objp = cache_alloc_refill(cachep, flags, force_refill);	/* analyzed below */
	/*
	 * the 'ac' may be updated by cache_alloc_refill(),
	 * and kmemleak_erase() requires its correct value.
	 */
	ac = cpu_cache_get(cachep);

out:
	/*
	 * To avoid a false negative, if an object that is in one of the
	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
	 * treat the array pointers as a reference to the object.
	 */
	if (objp)
		kmemleak_erase(&ac->entry[ac->avail]);
	return objp;
}
[/c]
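The fast path above treats the per-CPU array_cache as a simple LIFO stack of object pointers. A hedged sketch of that behaviour in plain C (the model struct and the helpers get_obj/put_obj are made up; the real kernel helpers are ac_get_obj()/ac_put_obj() and carry extra bookkeeping):
[c]
/* Simplified model of the per-CPU array cache (illustration only). */
struct array_cache_model {
	unsigned int avail;	/* how many free object pointers are stacked here */
	unsigned int limit;	/* capacity of entry[] */
	void *entry[];		/* the stack of free objects */
};

/* pop: what the fast path of ____cache_alloc() effectively does */
static void *get_obj(struct array_cache_model *ac)
{
	if (!ac->avail)
		return NULL;		/* empty: the caller must refill */
	return ac->entry[--ac->avail];
}

/* push: what freeing an object (or refilling) effectively does */
static int put_obj(struct array_cache_model *ac, void *objp)
{
	if (ac->avail >= ac->limit)
		return -1;		/* full: the caller must flush objects back to the slabs */
	ac->entry[ac->avail++] = objp;
	return 0;
}
[/c]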
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, bool force_refill)
/* refill the per-CPU array with slab objects */
[c]
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
				bool force_refill)
{
	int batchcount;			/* how many objects to pull into the per-CPU array in one batch */
	struct kmem_cache_node *n;	/* the per-node slab lists of this cache */
	struct array_cache *ac;		/* the per-CPU array cache */
	int node;			/* NUMA node id; we ignore the NUMA aspect for now */

	check_irq_off();		/* verify that local interrupts are disabled */
	node = numa_mem_id();		/* the local memory node; on NUMA each node has its own lists */
	if (unlikely(force_refill))	/* we already know the cache must grow; hopefully rare */
		goto force_grow;
retry:					/* come back here after growing the cache */
	ac = cpu_cache_get(cachep);	/* get the per-CPU array cache */
	batchcount = ac->batchcount;	/* configured batch size for this cache */
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill. Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	n = get_node(cachep, node);	/* the kmem_cache_node holding the slab lists */

	BUG_ON(ac->avail > 0 || !n);
	spin_lock(&n->list_lock);	/* protect this node's slab lists */

	/* See if we can refill from the shared array */
	/* if this node has a shared array, transfer objects from it first */
	if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
		n->shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {	/* still objects left to transfer */
		struct list_head *entry;
		struct page *page;
		/* Get slab alloc is to come from. */
		/* scan the partially-used slabs first, then the free slabs */
		entry = n->slabs_partial.next;
		if (entry == &n->slabs_partial) {
			n->free_touched = 1;
			entry = n->slabs_free.next;
			if (entry == &n->slabs_free)
				goto must_grow;
			/* with circular doubly-linked lists an empty list is spotted
			   immediately; if both lists are empty the cache must grow */
		}

		page = list_entry(entry, struct page, lru);	/* the page backing this slab */
		check_spinlock_acquired(cachep);		/* debug: make sure we hold the lock */

		/*
		 * The slab was either on partial or free list so
		 * there must be at least one object available for
		 * allocation.
		 */
		BUG_ON(page->active >= cachep->num);

		while (page->active < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);
			/* statistics updates */

			ac_put_obj(cachep, ac, slab_get_obj(cachep, page, node));
		}

		/* move slabp to correct slabp list: */
		/* put the slab back on the list that matches its new state */
		list_del(&page->lru);
		if (page->active == cachep->num)
			list_add(&page->lru, &n->slabs_full);
		else
			list_add(&page->lru, &n->slabs_partial);
	}

must_grow:
	n->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&n->list_lock);

	if (unlikely(!ac->avail)) {
		int x;
force_grow:					/* grow the cache by one slab */
		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
		/* cache_grow() is analyzed in detail below */

		/* cache_grow can reenable interrupts, then ac could change. */
		ac = cpu_cache_get(cachep);	/* re-read this CPU's array cache */
		node = numa_mem_id();		/* and the local node */

		/* no objects in sight? abort */
		if (!x && (ac->avail == 0 || force_refill))
			return NULL;

		if (!ac->avail)		/* objects refilled by interrupt? */
			goto retry;	/* still nothing in the per-CPU array: walk the lists again */
	}
	ac->touched = 1;

	return ac_get_obj(cachep, ac, flags, force_refill);
}
[/c]
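cache_alloc_refill() walks the per-node lists slabs_partial, slabs_free and slabs_full. An abridged, hedged sketch of the kmem_cache_node members involved (only the fields the code above uses; the real structure has more, and the name here carries a _sketch suffix to mark it as illustrative):
[c]
/* Abridged sketch of the slab allocator's per-node structure -- only the
 * members referenced by cache_alloc_refill() above. */
struct kmem_cache_node_sketch {
	spinlock_t list_lock;		/* protects the three lists below */
	struct list_head slabs_partial;	/* slabs with some objects in use */
	struct list_head slabs_full;	/* slabs with every object in use */
	struct list_head slabs_free;	/* completely free slabs */
	unsigned long free_objects;	/* total free objects on this node */
	unsigned int free_touched;	/* recent-activity marker for the free list */
	struct array_cache *shared;	/* objects shared by all CPUs of this node */
};
[/c]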
<mm/slab.c>
static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid, struct page *page)
[c]
/*
 * Grow (by 1) the number of slabs within a cache. This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
/* grow the cache by one slab when no free objects are left */
static int cache_grow(struct kmem_cache *cachep,
		      gfp_t flags, int nodeid, struct page *page)
{
	void *freelist;
	size_t offset;
	gfp_t local_flags;
	struct kmem_cache_node *n;

	/*
	 * Be lazy and only check for valid flags here, keeping it out of the
	 * critical path in kmem_cache_alloc().
	 */
	/* only the flag sanity check is done here, to keep it off the hot path */
	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
		pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
		BUG();
	}
	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
	/* keep only the constraint-related and reclaim-related flags */

	/* Take the node list lock to change the colour_next on this node */
	check_irq_off();			/* verify that interrupts are disabled */
	n = get_node(cachep, nodeid);		/* the cache's per-node structure */
	spin_lock(&n->list_lock);

	/* Get colour for the slab, and cal the next value. */
	/* pick this slab's colour and advance colour_next */
	offset = n->colour_next;
	n->colour_next++;
	if (n->colour_next >= cachep->colour)
		n->colour_next = 0;		/* wrap around after cachep->colour colours */
	spin_unlock(&n->list_lock);		/* colouring decided, drop the lock */

	offset *= cachep->colour_off;		/* turn the colour index into a byte offset */

	if (local_flags & __GFP_WAIT)		/* the allocation may sleep, so re-enable interrupts */
		local_irq_enable();

	/*
	 * The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	kmem_flagcheck(cachep, flags);		/* DMA-related flag checks */

	/*
	 * Get mem for the objs. Attempt to allocate a physical page from
	 * 'nodeid'.
	 */
	/* get physical pages to hold the objects */
	if (!page)
		page = kmem_getpages(cachep, local_flags, nodeid);
	if (!page)				/* page allocation failed: give up */
		goto failed;

	/* Get slab management. */
	/* allocate the slab management data (the freelist) */
	freelist = alloc_slabmgmt(cachep, page, offset,
			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
	if (!freelist)
		goto opps1;

	slab_map_pages(cachep, page, freelist);	/* link the pages back to the cache and freelist */

	cache_init_objs(cachep, page);		/* initialize every object in the new slab */

	if (local_flags & __GFP_WAIT)		/* if we enabled interrupts above, disable them again */
		local_irq_disable();
	check_irq_off();			/* verify that interrupts are disabled */
	spin_lock(&n->list_lock);

	/* Make slab active. */
	list_add_tail(&page->lru, &(n->slabs_free));	/* the new slab starts on the free list */
	STATS_INC_GROWN(cachep);			/* statistics: grown counter */
	n->free_objects += cachep->num;			/* all of its objects are free */
	spin_unlock(&n->list_lock);
	return 1;
opps1:
	kmem_freepages(cachep, page);		/* undo the page allocation */
failed:
	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	return 0;
}
[/c]
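The colouring code above simply cycles offset through 0, colour_off, 2*colour_off, ... up to (colour - 1)*colour_off, so that successive slabs place their first object at different cache-line offsets. A small, hedged worked example (the numbers are illustrative, not taken from a real cache):
[c]
/*
 * Illustration only. Suppose cachep->colour = 4 and cachep->colour_off = 64
 * (one L1 cache line). Then successive calls to cache_grow() use:
 *
 *   colour_next = 0  ->  offset = 0 * 64 =   0
 *   colour_next = 1  ->  offset = 1 * 64 =  64
 *   colour_next = 2  ->  offset = 2 * 64 = 128
 *   colour_next = 3  ->  offset = 3 * 64 = 192
 *   colour_next wraps back to 0, and the cycle repeats.
 *
 * Each new slab therefore starts its objects at a different cache-line
 * offset, spreading frequently accessed objects across cache sets.
 */
[/c]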