23.12.5.-7
buddy是内核启动后的物理内存分配器,是其它内存分配器的基础。它的设计思想是将所有的页按照1,2,4,……,1024个连续的页框为单位进行管理。也就是说Buddy系统分配的物理内存最大的连续范围为:
$1024 \times 4\,\text{KB} = 4\,\text{MB}$
每种大小有一个单独的链表,也就是说,所谓的buddy系统实际上就是按照某种规则来维护这11个($2^0$ ~ $2^{10}$)链表。每个链表上都串了很多个对应大小的内存块,每个order的空闲块链表使用struct free_area进行表示。每个zone都有一个长度为11的free_area数组,即每个zone都相当于有自己的buddy分配器。
/* Free memory management - zoned buddy allocator. */
/*
 * Valid orders run 0 .. MAX_ORDER-1, i.e. block sizes of 2^0 .. 2^10
 * pages with the default of 11; architectures may override via Kconfig.
 */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
/* Page count of the largest block: 1 << (11 - 1) = 1024 pages by default. */
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
...
struct zone {
...
struct free_area free_area[MAX_ORDER];
...
};
//include/linux/mmzone.h
struct free_area的具体定义为:
/*
 * One bucket of the buddy allocator: all free blocks of a single order,
 * segregated into one list per migratetype to limit external fragmentation.
 */
struct free_area {
struct list_head free_list[MIGRATE_TYPES]; /* one list head per migratetype */
unsigned long nr_free; /* free blocks of this order, summed over all lists */
};
//include/linux/mmzone.h
order通过zone中free_area的index进行表示,这个order也表示该内存块的大小为$2^{order}$个页。可以看到每个大小的内存块链表里面实际包含了MIGRATE_TYPES个链表。这是一个锦上添花的操作,大概的作用就是把空闲的块按照能否移动的性质分成几类,在以后出现大量外部碎片的时候,可以做些移动操作来减小碎片。
/*
 * Free pages are classified by how easily they can be moved; keeping
 * movable and unmovable allocations apart limits external fragmentation.
 */
enum migratetype {
MIGRATE_UNMOVABLE, /* cannot be migrated (e.g. core kernel allocations) */
MIGRATE_MOVABLE, /* can be migrated, typically user-space pages */
MIGRATE_RECLAIMABLE, /* can be reclaimed under memory pressure */
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, /* reserve for high-order atomic requests */
#ifdef CONFIG_CMA
/*
 * MIGRATE_CMA migration type is designed to mimic the way
 * ZONE_MOVABLE works. Only movable pages can be allocated
 * from MIGRATE_CMA pageblocks and page allocator never
 * implicitly change migration type of MIGRATE_CMA pageblock.
 *
 * The way to use it is to change migratetype of a range of
 * pageblocks to MIGRATE_CMA which can be done by
 * __free_pageblock_cma() function.
 */
MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
MIGRATE_ISOLATE, /* can't allocate from here */
#endif
MIGRATE_TYPES /* count; also the length of each free_list[] array */
};
//include/linux/mmzone.h
向buddy分配器进行内存申请时,首先从这个区上的free_area中找到一个至少满足申请大小的有空间块的order。然后从这个order对应的链表中摘一块内存块,再按照经典的buddy原则判断是否需要切分。如果需要切分则将切分下来的buddy挂到order-1阶的链表上去。
buddy原则即:
- 如果这个块的一半即可满足要求,则将这个块切成两块,互称为buddy,一块分配出去,一块挂到order-1阶链表上。显然互为buddy的两个块是物理连续的。
- 如果超过块的一半才能满足要求,则将整个块都分配出去。
buddy初始化时,需要按照$2 \times 2^{order} \times 4\,\text{KB}$对齐。也就是任何一组buddy,buddy[0]和buddy[1]是唯一确定的。
buddy分配器的真实接口是rmqueue,该接口实际调用rmqueue_buddy。这个函数不断地尝试从zone中寻找符合order要求的page,申请的核心是__rmqueue_smallest和__rmqueue。
/*
 * Take a 2^order block straight from the zone's buddy freelists,
 * bypassing the per-cpu page lists. Returns the first page of the
 * block, or NULL if the zone cannot satisfy the request.
 * @preferred_zone is only used for NUMA hit/miss statistics.
 */
static __always_inline
struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
unsigned int order, unsigned int alloc_flags,
int migratetype)
{
struct page *page;
unsigned long flags;
do {
page = NULL;
/* The buddy freelists are protected by the zone spinlock. */
spin_lock_irqsave(&zone->lock, flags);
/*
 * order-0 request can reach here when the pcplist is skipped
 * due to non-CMA allocation context. HIGHATOMIC area is
 * reserved for high-order atomic allocation, so order-0
 * request should skip it.
 */
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
/* Normal path: pull from the requested migratetype. */
page = __rmqueue(zone, order, migratetype, alloc_flags);
/*
 * If the allocation fails, allow OOM handling access
 * to HIGHATOMIC reserves as failing now is worse than
 * failing a high-order atomic allocation in the
 * future.
 */
if (!page && (alloc_flags & ALLOC_OOM))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
spin_unlock_irqrestore(&zone->lock, flags);
return NULL;
}
}
/* Account the 2^order pages removed from the free counters. */
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
spin_unlock_irqrestore(&zone->lock, flags);
} while (check_new_pages(page, order)); /* retry if the pages fail sanity checks */
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
return page;
}
//mm/page_alloc.c
__rmqueue_smallest对当前zone中的freelists,按照order向上依次尝试。如果能够从当前order的链表中摘取第一个非空内存块,则找到了合适的内存块。
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue; /* this order's list is empty, try a larger block */
del_page_from_free_list(page, zone, current_order);
/* Split off and requeue any unused tail of the block. */
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
migratetype < MIGRATE_PCPTYPES);
return page;
}
return NULL; /* no free block of sufficient order in this zone */
}
//mm/page_alloc.c
摘取的函数get_page_from_free_area具体实现为
/*
 * Return the first free block of @migratetype in @area, or NULL if that
 * list is empty. Does not remove the page from the list.
 */
static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
return list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
}
//include/linux/mmzone.h
找到合适的内存块之后,将其从对应的order链表中删除。注意,这里返回的page不是一个页,而只是首页,其后的$2^{order}-1$个页都属于这次分配。将内存块从链表移除后,使用expand函数判断是否需要并且执行buddy分割。
expand函数中的low是申请的内存的order数,high是当前分配的block的order数。expand会把多余的$2^{high}-2^{low}$个page逐级拆分,重新挂到合适的order链表中。
/*
 * Split a block of order @high down to order @low: each halving puts the
 * upper buddy back on the next-lower free list, leaving the first 2^low
 * pages (starting at @page) for the caller.
 */
static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;
while (high > low) {
high--;
size >>= 1; /* size is now the page count of each half */
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
/*
 * Mark as guard pages (or page), that will allow to
 * merge back to allocator when buddy will be freed.
 * Corresponding page table entries will not be touched,
 * pages will stay not present in virtual address space
 */
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
/* Return the upper half to the free list of the reduced order. */
add_to_free_list(&page[size], zone, high, migratetype);
set_buddy_order(&page[size], high);
}
}
//mm/page_alloc.c
这个过程中可能产生新的buddy,也可能产生更小order的内存块。例如申请一个3阶(也就是8个连续的page)的内存块,但是当前空闲的最小的内存块的阶数为5阶(即32个连续的page),那么expand会将多余的内存区域分成:
- 一个4阶(16个连续的page)的内存块
- 一个3阶(8个连续的page)的内存块
剩余的那一个3阶的内存块就是要返回的内存块了。
__rmqueue()的底层实现也是__rmqueue_smallest(),因此不再多说。
经过层层封装,最终对外使用Buddy系统的接口是get_page_from_freelist:
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)