mem_map
mem_map is a global variable that points to a struct page array covering every physical page in the system; each page structure in the array describes one physical page frame.
mem_map is only meaningful when the system has a single node; on the ARM platform there is only one node.
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0)) {
    mem_map = NODE_DATA(0)->node_mem_map;
}
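With the flat memory model, converting between a page frame number and its struct page is just pointer arithmetic on mem_map; roughly, as in the FLATMEM case of include/asm-generic/memory_model.h:
/* FLATMEM: a pfn is simply an index into the single global mem_map array */
#define __pfn_to_page(pfn)   (mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)  ((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)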
NODE_DATA(0)->node_mem_map
Each memory node's node_mem_map member points to a struct page array that describes the physical page frames of all the zones in that node.
node_mem_map is allocated in alloc_node_mem_map:
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
    unsigned long size, start, end;
    struct page *map;
    /*
     * The zone's endpoints aren't required to be MAX_ORDER
     * aligned but the node_mem_map endpoints must be in order
     * for the buddy allocator to function correctly.
     */
    start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
    end = pgdat_end_pfn(pgdat);
    end = ALIGN(end, MAX_ORDER_NR_PAGES);
    size = (end - start) * sizeof(struct page);
    map = alloc_remap(pgdat->node_id, size);
    if (!map)
        map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
    pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
From the comments in the code above we can take away the following:
1. A zone's endpoints do not need to be MAX_ORDER aligned.
2. The node_mem_map endpoints, however, must be MAX_ORDER aligned when the map is allocated, so that the buddy allocator works correctly.
3. mem_map is allocated from memblock, so its starting address is decided dynamically; its size is the number of pages in the aligned range multiplied by sizeof(struct page), 36 bytes here (see the sketch below).
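A standalone sketch of that size calculation with hypothetical PFN values (MAX_ORDER = 11, 4 KiB pages, and sizeof(struct page) = 36 as stated above); it only mirrors the arithmetic done in alloc_node_mem_map:
#include <stdio.h>

#define MAX_ORDER_NR_PAGES  (1UL << (11 - 1))            /* 1024 pages = 4 MiB */
#define ALIGN(x, a)         (((x) + (a) - 1) & ~((a) - 1))
#define SIZEOF_STRUCT_PAGE  36UL                          /* per the text above */

int main(void)
{
    unsigned long node_start_pfn = 0x60100;               /* hypothetical */
    unsigned long node_end_pfn   = 0x7FF80;               /* hypothetical */

    unsigned long start  = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
    unsigned long end    = ALIGN(node_end_pfn, MAX_ORDER_NR_PAGES);
    unsigned long size   = (end - start) * SIZEOF_STRUCT_PAGE;
    unsigned long offset = node_start_pfn - start;        /* node_mem_map = map + offset */

    printf("start=0x%lx end=0x%lx size=%lu bytes offset=%lu pages\n",
           start, end, size, offset);
    return 0;
}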
meminfo
meminfo holds the memory configuration detected during boot and is used by the early initialization functions.
struct meminfo {
    int nr_banks;
    struct membank bank[NR_BANKS];
};
/*
* This keeps memory configuration data used by a couple memory
* initialization functions, as well as show_mem() for the skipping
* of holes in the memory map. It is populated by arm_add_memory().
*/
struct meminfo meminfo;
U-Boot passes in the memory bank configuration, and the kernel fills meminfo from these parameters during early initialization. There are two ways to pass the memory configuration to the kernel:
- through the memory node of the device tree
- the traditional command line parameter: mem=size@start
Only the device tree way of passing in the memory layout is described here.
early_init_dt_scan_memory parses the memory nodes in the device tree:
->early_init_dt_add_memory_arch
->arm_add_memory(base, size); fills in one bank; base and size are page-aligned, and any part of the range above 4 GB is truncated (see the sketch below).
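A minimal user-space sketch of the page alignment and 4 GB truncation just described; the helper name and the NR_BANKS value are placeholders, and this is not the actual arm_add_memory code:
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE      4096ULL
#define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define NR_BANKS       8
#define ADDR_LIMIT     (1ULL << 32)               /* 4 GB, no LPAE assumed */

struct membank { uint64_t start, size; };
struct meminfo { int nr_banks; struct membank bank[NR_BANKS]; };
static struct meminfo meminfo;

/* Illustrative only: page-align the bank and drop anything above 4 GB. */
static int add_bank(uint64_t base, uint64_t size)
{
    uint64_t aligned = PAGE_ALIGN(base);

    if (meminfo.nr_banks >= NR_BANKS || aligned >= ADDR_LIMIT)
        return -1;
    size -= aligned - base;                       /* skip the unaligned head */
    if (aligned + size > ADDR_LIMIT)
        size = ADDR_LIMIT - aligned;              /* truncate above 4 GB     */

    meminfo.bank[meminfo.nr_banks].start = aligned;
    meminfo.bank[meminfo.nr_banks].size  = size & ~(PAGE_SIZE - 1);
    meminfo.nr_banks++;
    return 0;
}

int main(void)
{
    add_bank(0x80000100ULL, 0x100000000ULL);      /* unaligned base, 4 GB size */
    printf("bank0: start=0x%llx size=0x%llx\n",
           (unsigned long long)meminfo.bank[0].start,
           (unsigned long long)meminfo.bank[0].size);
    return 0;
}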
Adjusting meminfo
After meminfo has been populated from the device tree it is not fixed; the kernel will readjust the banks in it.
sanity_check_meminfo walks every bank; if a bank crosses vmalloc_limit, that bank is split into two banks (see the sketch below).
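A minimal user-space sketch of that split; the fixed vmalloc_limit value and the bank numbers are hypothetical, and this is not the actual sanity_check_meminfo code:
#include <stdint.h>
#include <stdio.h>

struct membank { uint64_t start, size; };

/* Hypothetical physical address of the top of lowmem. */
static const uint64_t vmalloc_limit = 0xB0000000ULL;

/* Split a bank that crosses vmalloc_limit into a lowmem and a highmem part. */
static int split_bank(struct membank *bank, struct membank *high)
{
    if (bank->start >= vmalloc_limit || bank->start + bank->size <= vmalloc_limit)
        return 0;                                 /* nothing to split */

    high->start = vmalloc_limit;
    high->size  = bank->start + bank->size - vmalloc_limit;
    bank->size  = vmalloc_limit - bank->start;    /* lowmem part keeps the old start */
    return 1;
}

int main(void)
{
    struct membank bank = { .start = 0xA0000000ULL, .size = 0x20000000ULL };  /* 512 MB */
    struct membank high;

    if (split_bank(&bank, &high))
        printf("low:  0x%llx +0x%llx\nhigh: 0x%llx +0x%llx\n",
               (unsigned long long)bank.start, (unsigned long long)bank.size,
               (unsigned long long)high.start, (unsigned long long)high.size);
    return 0;
}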
memblock
memblock was introduced in kernel 2.6.35 to replace bootmem and to simplify the boot-time memory management code. The memblock code was not written from scratch; it reuses the existing logical memory block (LMB) code, which had long been used on the Microblaze, PowerPC, SuperH and SPARC architectures.
struct memblock_region {
    phys_addr_t base;
    phys_addr_t size;
    unsigned long flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
    int nid;
#endif
};
struct memblock_type {
    unsigned long cnt;          /* number of regions */
    unsigned long max;          /* size of the allocated array */
    phys_addr_t total_size;     /* size of all regions */
    struct memblock_region *regions;
};
struct memblock {
    bool bottom_up;             /* is bottom up direction? */
    phys_addr_t current_limit;
    struct memblock_type memory;
    struct memblock_type reserved;
};
extern struct memblock memblock;
memblock contains two region lists: memory and reserved.
memory describes the memory regions managed by memblock, while reserved describes the regions memblock has set aside (both memory allocated through memblock and ranges reserved via memblock_reserve); address ranges listed in reserved can no longer be handed out by the allocator.
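A toy user-space model of the memory/reserved relationship: "allocating" from memblock means carving a range out of a memory region and recording it in reserved so it cannot be handed out again. The names and the top-down policy below are illustrative, not the kernel implementation:
#include <stdio.h>

typedef unsigned long long phys_t;
struct region { phys_t base, size; };

static struct region memory = { 0x80000000ULL, 0x20000000ULL };   /* one 512 MB bank */
static struct region reserved[16];
static int nr_reserved;

static void reserve(phys_t base, phys_t size)
{
    reserved[nr_reserved++] = (struct region){ base, size };
}

/* top-down "allocation": take the highest free range and mark it reserved */
static phys_t alloc_top_down(phys_t size)
{
    phys_t top = memory.base + memory.size;
    int i;

    for (i = 0; i < nr_reserved; i++)            /* step below reserved ranges at the top */
        if (reserved[i].base + reserved[i].size > top - size)
            top = reserved[i].base;
    reserve(top - size, size);
    return top - size;
}

int main(void)
{
    reserve(0x80000000ULL, 0x00800000ULL);       /* e.g. kernel image */
    phys_t a = alloc_top_down(0x4000);           /* e.g. an early page table */
    phys_t b = alloc_top_down(0x4000);
    printf("a=0x%llx b=0x%llx reserved regions=%d\n", a, b, nr_reserved);
    return 0;
}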
swapper_pg_dir
totalram_pages
totalreserve_pages
zone
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
* to run OOM on the lower zones despite there's tons of freeable ram
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
/*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this zone's LRU. Maintained by the pageout code.
*/
unsigned int inactive_ratio;
struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
/*
* This is a per-zone reserve of pages that should not be
* considered dirtyable memory.
*/
unsigned long dirty_balance_reserve;
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_NUMA
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't
* tolerate drift of present_pages should hold memory hotplug lock to
* get a stable value.
*
* Read access to managed_pages should be safe because it's unsigned
* long. Write access to zone->managed_pages and totalram_pages are
* protected by managed_page_count_lock at runtime. Ideally only
* adjust_managed_page_count() should be used instead of directly
* touching zone->managed_pages and totalram_pages.
*/
unsigned long managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
const char *name;
/*
* Number of MIGRATE_RESERVE page block. To maintain for just
* optimization. Protected by zone->lock.
*/
int nr_migrate_reserve_block;
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
/*
* wait_table -- the array holding the hash table
* wait_table_hash_nr_entries -- the size of the hash table array
* wait_table_bits -- wait_table_size == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
wait_queue_head_t *wait_table;
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
ZONE_PADDING(_pad1_)
/* Write-intensive fields used from the page allocator */
spinlock_t lock;
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Write-intensive fields used by page reclaim */
/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct lruvec lruvec;
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
spanned_pages: the total number of physical pages the zone spans from start_pfn to end_pfn, including any holes in between: spanned_pages = zone_end_pfn - zone_start_pfn
present_pages: the physical pages actually present in the zone: present_pages = spanned_pages - absent_pages (pages in holes)
managed_pages: the present pages managed by the buddy system: managed_pages = present_pages - reserved_pages
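A quick worked example with hypothetical numbers: a zone spanning PFNs 0x60000-0x80000 has spanned_pages = 0x20000; if 0x1000 of those page frames fall into holes, present_pages = 0x1F000; and if the boot-time allocator has reserved 0x800 pages, managed_pages = 0x1E800.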
Computing managed_pages
mem_init->free_all_bootmem
free_all_bootmem has two implementations, one in nobootmem.c and one in bootmem.c; by default the kernel enables CONFIG_NO_BOOTMEM, so the nobootmem.c implementation is used.
unsigned long __init free_all_bootmem(void)
{
    unsigned long pages;

    reset_all_zones_managed_pages();
    /*
     * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
     * because in some case like Node0 doesn't have RAM installed
     * low ram will be on Node1
     */
    pages = free_low_memory_core_early();
    totalram_pages += pages;
    return pages;
}
reset_all_zones_managed_pages zeroes managed_pages for every zone.
free_low_memory_core_early then recomputes each zone's managed_pages (lowmem only):
static unsigned long __init free_low_memory_core_early(void)
{
    unsigned long count = 0;
    phys_addr_t start, end;
    u64 i;

    for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
        count += __free_memory_core(start, end);
    }
    return count;
}
This walks all the free memory ranges described by memblock; a free range is what remains of a memory region after the reserved regions have been subtracted from it (see the sketch below).
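A minimal user-space sketch of that subtraction, assuming both region lists are sorted and non-overlapping; this is illustrative only and not the kernel's __next_mem_range walker:
#include <stdio.h>

typedef unsigned long long phys_t;
struct region { phys_t base, size; };

/* toy memblock: one memory region, two reserved regions inside it */
static struct region memory[]   = { { 0x80000000ULL, 0x20000000ULL } };
static struct region reserved[] = { { 0x80000000ULL, 0x00800000ULL },
                                    { 0x90000000ULL, 0x00100000ULL } };
#define NR(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
    phys_t free_pages = 0;

    for (unsigned m = 0; m < NR(memory); m++) {
        phys_t cur = memory[m].base;
        phys_t end = memory[m].base + memory[m].size;

        /* emit the gaps between reserved regions that fall inside [cur, end) */
        for (unsigned r = 0; r < NR(reserved); r++) {
            phys_t rs = reserved[r].base, re = rs + reserved[r].size;
            if (re <= cur || rs >= end)
                continue;
            if (rs > cur) {
                printf("free: 0x%llx - 0x%llx\n", cur, rs);
                free_pages += (rs - cur) >> 12;    /* 4 KiB pages assumed */
            }
            cur = re > cur ? re : cur;
        }
        if (cur < end) {
            printf("free: 0x%llx - 0x%llx\n", cur, end);
            free_pages += (end - cur) >> 12;
        }
    }
    printf("free pages: %llu\n", free_pages);
    return 0;
}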