Memory is one of a computer's most important hardware resources, but without software to manage it, even the largest and fastest memory is just decoration. Management begins with initialization. This article walks through my understanding of memory initialization, based on the kernel source (5.13) and some reference material.
Let's start with the memory-related hardware. Below is a computer motherboard (picture from the internet):

The CPU/memory area on the board forms two NUMA nodes; a NUMA node can be thought of as one CPU plus the DIMMs attached to it. Understanding this matters for Linux memory initialization and management. From the physical layout you can already sense that memory is not symmetric with respect to the CPUs: each CPU sees the memory of its own NUMA node uniformly, but accessing memory that belongs to the other node takes longer. These low-level hardware constraints shape how the kernel manages memory.
Now into the kernel. Where does resource management start? If you run a factory and a batch of raw material has just arrived, the first thing you do is count what came in, by type and quantity, and record it; only then can you manage it. The kernel treats memory the same way: it first has to find out how much memory there is and how the physical addresses are laid out.
Each architecture has its own way of discovering basic device information. On arm64 the device tree is mandatory; it describes the basic devices, including memory.
start_kernel -> setup_arch -> setup_machine_fdt -> early_init_dt_scan_nodes -> of_scan_flat_dt(early_init_dt_scan_memory, NULL) -> early_init_dt_scan_memory
/*
* early_init_dt_scan_memory - Look for and parse memory nodes
*/
int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
int depth, void *data)
{
const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
const __be32 *reg, *endp;
int l;
bool hotpluggable;
/* We are scanning "memory" nodes only */
if (type == NULL || strcmp(type, "memory") != 0)
return 0;
// read the memory information from the DTB
reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
if (reg == NULL)
reg = of_get_flat_dt_prop(node, "reg", &l);
if (reg == NULL)
return 0;
...
// the (base, size) pairs parsed from the DTB are handed over and end up in memblock.memory
early_init_dt_add_memory_arch(base, size);
...
}
void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
{
const u64 phys_offset = MIN_MEMBLOCK_ADDR;
if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
pr_warn("Ignoring memory block 0x%llx - 0x%llx\n",
base, base + size);
return;
}
....
// after some sanity checks, the base and size are added to memblock
memblock_add(base, size);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
// add the range [base, base + size) to memblock.memory as a memory region
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
}
static int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, enum memblock_flags flags)
{
bool insert = false;
phys_addr_t obase = base;
phys_addr_t end = base + memblock_cap_size(base, &size);
int idx, nr_new;
struct memblock_region *rgn;
if (!size)
return 0;
/* special case for empty array */
// at this point memblock holds no regions yet (region[0].size == 0), so the memory range
// obtained from the DTB ends up stored in memblock.memory.regions[0]
if (type->regions[0].size == 0) {
WARN_ON(type->cnt != 1 || type->total_size);
type->regions[0].base = base;
type->regions[0].size = size;
type->regions[0].flags = flags;
memblock_set_region_node(&type->regions[0], nid);
type->total_size = size;
return 0;
}
....
}
So memblock.memory.regions is where the memory ranges are recorded; during early memory initialization, this is where the basic memory information comes from.
phys_addr_t __init_memblock memblock_start_of_DRAM(void)
{
return memblock.memory.regions[0].base;
}
memblock_start_of_DRAM() is the API for getting the DRAM base address; it simply returns what memblock already recorded.
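As an aside, early boot code can walk every range recorded in memblock.memory with the for_each_mem_range() iterator from <linux/memblock.h>. The sketch below is my own illustration (the function name and print format are made up), not code taken from the kernel:
#include <linux/memblock.h>
#include <linux/printk.h>
/* Minimal sketch: walk the ranges registered in memblock.memory. */
static void __init dump_memblock_memory(void)
{
	phys_addr_t start, end;
	u64 i;
	/* iterates over memblock.memory in ascending address order */
	for_each_mem_range(i, &start, &end)
		pr_info("memory range %llu: [%pa - %pa]\n", i, &start, &end);
}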
Now that we know how the memory base address and size are obtained, let's follow the initialization flow through the kernel source. A few important data structures will come up, so let's first build up the hierarchy Linux uses to describe memory.
The hardware photo at the start of the article introduced NUMA (Non-Uniform Memory Access). Each CPU in that photo can be regarded as one NUMA node, so the board has two of them. In the Linux kernel a NUMA node is called a node, described by struct pglist_data. Each node is divided into several zones, and below each zone sits the smallest unit of memory management, the page. The figure below shows this hierarchy.

-------- The figure above is from Professional Linux Kernel Architecture (《深入Linux内核架构》)
These three data structures are the foundation of memory management, so let's look at what each of them contains.
typedef struct pglist_data {
/*
* node_zones contains just the zones for THIS node. Not all of the
* zones may be populated, but it is the full list. It is referenced by
* this node's node_zonelists as well as other node's node_zonelists.
*/
// every zone belonging to THIS node is kept in this array
struct zone node_zones[MAX_NR_ZONES];
/*
* node_zonelists contains references to all zones in all nodes.
* Generally the first zones will be references to this node's
* node_zones.
*/
// node_zonelists references the zones of all nodes in the system. If the current node runs
// short of memory, allocation falls back to other nodes; the list is ordered by access cost from this node, lowest first.
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
/*
* Must be held any time you expect node_start_pfn,
* node_present_pages, node_spanned_pages or nr_zones to stay constant.
* Also synchronizes pgdat->first_deferred_pfn during deferred page
* init.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
* or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
*
* Nests above zone->lock and zone->span_seqlock
*/
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
/*
* This is a per-node reserve of pages that are not available
* to userspace allocations.
*/
unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
/*
* node reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* If memory initialisation on large machines is deferred then this
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split deferred_split_queue;
#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
* NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
*
* Use mem_cgroup_lruvec() to look up lruvecs.
*/
struct lruvec __lruvec;
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
As you can see, pg_data_t contains the node's zones and its zonelists.
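To make the node -> zone containment concrete, here is a minimal sketch (my own illustration, not kernel code) that walks every online node and prints its populated zones using the standard helpers:
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/printk.h>
/* Minimal sketch: walk the node -> zone hierarchy. */
static void dump_node_zones(void)
{
	int nid;
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		int zid;
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			struct zone *zone = &pgdat->node_zones[zid];
			if (!populated_zone(zone))	/* skip empty zones */
				continue;
			pr_info("node %d zone %-8s: %lu present pages\n",
				nid, zone->name, zone->present_pages);
		}
	}
}
Next, struct zone itself: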
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long _watermark[NR_WMARK];
unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
/*
* We don't know if the memory that we're going to allocate will be
* freeable or/and it will be released eventually, so to avoid totally
* wasting several GB of ram we must reserve some of the lower zone
* memory (otherwise we risk to run OOM on the lower zones despite
* there being tons of freeable ram on the higher zones). This array is
* recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
* changes.
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat;
// per-CPU page lists (hot/cold pages): pages near the head are cache-hot, pages towards the tail grow cold
struct per_cpu_pageset __percpu *pageset;
/*
* the high and batch values are copied to individual pagesets for
* faster access
*/
int pageset_high;
int pageset_batch;
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/end(). Any reader who can't tolerant drift of
* present_pages should get_online_mems() to get a stable value.
*/
atomic_long_t managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* freepage counting problem due to racy retrieving migratetype
* of pageblock. Protected by zone->lock.
*/
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
// buddy-system free lists; the array index is the allocation order (a power of two)
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags;
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
unsigned long compact_init_migrate_pfn;
unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
* compact_order_failed is the minimum compaction failed order.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
The members of struct zone that manage pages directly are pageset, the per-CPU hot/cold page lists used for single-page allocations, and free_area, which belongs to the buddy system. Another important concept tied to zone is zone_type:
enum zone_type {
#ifdef CONFIG_ZONE_DMA
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};
ZONE_NORMAL is always present; ZONE_DMA and ZONE_DMA32 exist only when the corresponding config options are enabled. ZONE_HIGHMEM exists on 32-bit systems but not on 64-bit ones. ZONE_DEVICE can be ignored for now; it is needed for things like NVDIMM.
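One way to see what the zone types are for: the GFP flags passed to the page allocator select the highest zone an allocation may come from. A minimal sketch using the real gfp_zone() helper (the wrapper function is mine; the mappings in the comment assume a typical arm64 config with ZONE_DMA and ZONE_DMA32 enabled):
#include <linux/gfp.h>
/* Minimal sketch: GFP flags pick the highest zone an allocation may use. */
static enum zone_type highest_zone_for(gfp_t gfp)
{
	/* e.g. GFP_KERNEL -> ZONE_NORMAL, GFP_DMA -> ZONE_DMA, GFP_DMA32 -> ZONE_DMA32 */
	return gfp_zone(gfp);
}
Now for struct page itself: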
struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
* WARNING: bit 0 of the first word is used for PageTail(). That
* means the other users of this union MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct { /* Page cache and anonymous pages */
/**
* @lru: Pageout list, eg. active_list protected by
* lruvec->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
struct list_head lru;
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
/**
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
* Used for swp_entry_t if PageSwapCache.
* Indicates order in the buddy system if PageBuddy.
*/
unsigned long private;
};
struct { /* page_pool used by netstack */
/**
* @dma_addr: might require a 64-bit value even on
* 32-bit architectures.
*/
dma_addr_t dma_addr;
};
struct { /* slab, slob and slub */
union {
struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
int pages; /* Nr of pages left */
int pobjects; /* Approximate count */
#else
short int pages;
short int pobjects;
#endif
};
};
struct kmem_cache *slab_cache; /* not slob */
/* Double-word boundary */
void *freelist; /* first free object */
union {
void *s_mem; /* slab: first object */
unsigned long counters; /* SLUB */
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
};
};
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
/* First tail page only */
unsigned char compound_dtor;
unsigned char compound_order;
atomic_t compound_mapcount;
unsigned int compound_nr; /* 1 << compound_order */
};
struct { /* Second tail page of compound page */
unsigned long _compound_pad_1; /* compound_head */
atomic_t hpage_pinned_refcount;
/* For both global and memcg */
struct list_head deferred_list;
};
struct { /* Page table pages */
unsigned long _pt_pad_1; /* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
unsigned long _pt_pad_2; /* mapping */
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
};
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
void *zone_device_data;
/*
* ZONE_DEVICE private pages are counted as being
* mapped so the next 3 words hold the mapping, index,
* and private fields from the source anonymous or
* page cache page while the page is migrated to device
* private memory.
* ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
* use the mapping, index, and private fields when
* pmem backed DAX files are mapped.
*/
};
/** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
};
union { /* This union is 4 bytes in size. */
/*
* If the page can be mapped to userspace, encodes the number
* of times this page is referenced by a page table.
*/
atomic_t _mapcount;
/*
* If the page is neither PageSlab nor mappable to userspace,
* the value stored here may help determine what this page
* is used for. See page-flags.h for a list of page types
* which are currently stored here.
*/
unsigned int page_type;
unsigned int active; /* SLAB */
int units; /* SLOB */
};
/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
atomic_t _refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
} _struct_page_alignment;
A page is the smallest unit the system manages. All physical memory is organised page by page, and every page frame gets one struct page as its "household registration record", which is why struct page has to be kept as small as possible.
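To get a feel for why its size matters: with 4 KiB pages and a 64-byte struct page (the usual size on 64-bit builds, though it depends on config options), the memmap alone costs 64/4096, roughly 1.6% of all RAM. The sketch below only illustrates the 1:1 pfn <-> struct page mapping, using the real pfn_to_page()/page_to_pfn() helpers; the function name is made up:
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
/* Minimal sketch: every page frame number (pfn) maps to exactly one struct page. */
static void __init memmap_cost_example(void)
{
	unsigned long first_pfn = PHYS_PFN(memblock_start_of_DRAM());
	struct page *first_page = pfn_to_page(first_pfn);
	/* pfn <-> struct page is a 1:1 mapping maintained by the memmap */
	WARN_ON(page_to_pfn(first_page) != first_pfn);
	/* about 64 bytes of metadata for every 4096-byte page frame (~1.6%) */
	pr_info("sizeof(struct page) = %zu, PAGE_SIZE = %lu\n",
		sizeof(struct page), PAGE_SIZE);
}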
Memory initialization is essentially the question of how these management structures get initialized.
The kernel starts out in assembly; once the C runtime environment is ready, it jumps to the C entry point, start_kernel.
// only the calls relevant to memory initialization are kept here
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
...
setup_arch(&command_line);
...
build_all_zonelists(NULL);
page_alloc_init();
...
mm_init();
...
}
setup_arch() does the architecture-specific part of memory initialization.
void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
init_mm.start_code = (unsigned long) _stext;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = (unsigned long) _end;
*cmdline_p = boot_command_line;
/*
* If know now we are going to need KPTI then use non-global
* mappings from the start, avoiding the cost of rewriting
* everything later.
*/
arm64_use_ng_mappings = kaslr_requires_kpti();
early_fixmap_init();
early_ioremap_init();
// get the memory base address and size from the FDT
setup_machine_fdt(__fdt_pointer);
...
// initialize memblock, the allocator the kernel uses before the buddy system is up
arm64_memblock_init();
// page-table setup
paging_init();
...
// initialize nodes, zones and pages
bootmem_init();
...
}
memblock reserves the memory occupied by the kernel image and the DTB; the remaining usable memory is later handed over to the buddy system. That is what the familiar dmesg lines are about:
[ 6.997734] Freeing unused decrypted memory: 2040K
[ 6.998029] Freeing unused kernel image (initmem) memory: 2496K
[ 7.040496] Write protecting the kernel read-only data: 22528k
[ 7.040836] Freeing unused kernel image (text/rodata gap) memory: 2040K
[ 7.040977] Freeing unused kernel image (rodata/data gap) memory: 1072K
It looks as though part of the kernel image's memory is given back as well? Something to look into another time.
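For reference, the reservations mentioned above are made with memblock_reserve(). The sketch below is a simplified rendering of what arm64_memblock_init() does for the kernel image and the DTB; the real code in arch/arm64/mm/init.c handles more cases (initrd, crashkernel, ...), and dtb_phys/dtb_size are placeholder names of mine:
#include <linux/memblock.h>
#include <asm/memory.h>
#include <asm/sections.h>
/* Simplified sketch of the reservations arm64_memblock_init() makes. */
static void __init reserve_boot_memory_sketch(phys_addr_t dtb_phys, phys_addr_t dtb_size)
{
	/* keep the kernel image itself out of the free memory pool */
	memblock_reserve(__pa_symbol(_stext), _end - _stext);
	/* keep the flattened device tree around until it has been unflattened */
	memblock_reserve(dtb_phys, dtb_size);
}
Next, paging_init():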
void __init paging_init(void)
{
pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
map_kernel(pgdp);
map_mem(pgdp);
pgd_clear_fixmap();
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
init_mm.pgd = swapper_pg_dir;
memblock_free(__pa_symbol(init_pg_dir),
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
memblock_allow_resize();
}
paging_init() builds the kernel page tables and switches init_mm.pgd over to swapper_pg_dir; the MMU itself was enabled much earlier.
void __init bootmem_init(void)
{
unsigned long min, max;
min = PFN_UP(memblock_start_of_DRAM());
max = PFN_DOWN(memblock_end_of_DRAM());
early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
max_pfn = max_low_pfn = max;
min_low_pfn = min;
// initialize the NUMA nodes
arm64_numa_init();
/*
* must be done after arm64_numa_init() which calls numa_init() to
* initialize node_online_map that gets used in hugetlb_cma_reserve()
* while allocating required CMA size across online nodes.
*/
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
arm64_hugetlb_cma_reserve();
#endif
dma_pernuma_cma_reserve();
/*
* sparse_init() tries to allocate memory from memblock, so must be
* done after the fixed reservations
*/
sparse_init();
// initialize the zones
zone_sizes_init(min, max);
/*
* Reserve the CMA area after arm64_dma_phys_limit was initialised.
*/
dma_contiguous_reserve(arm64_dma_phys_limit);
/*
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
reserve_crashkernel();
memblock_dump_all();
}
Its main job is to initialize the nodes and zones.
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
unsigned int __maybe_unused acpi_zone_dma_bits;
unsigned int __maybe_unused dt_zone_dma_bits;
phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
#ifdef CONFIG_ZONE_DMA
acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
if (!arm64_dma_phys_limit)
arm64_dma_phys_limit = dma32_phys_limit;
#endif
if (!arm64_dma_phys_limit)
arm64_dma_phys_limit = PHYS_MASK + 1;
max_zone_pfns[ZONE_NORMAL] = max;
free_area_init(max_zone_pfns);
}
This first determines the boundaries of the DMA, DMA32 (if present) and NORMAL zones, and then initializes them.
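A worked example may help here (the numbers are hypothetical). Suppose DRAM spans 0x8000_0000 - 0x1_8000_0000 (4 GiB starting at 2 GiB), pages are 4 KiB and zone_dma_bits ends up as 32:
/*
 * Hypothetical layout, PAGE_SHIFT = 12:
 *   arm64_dma_phys_limit = max_zone_phys(32) = 0x1_0000_0000
 *   max_zone_pfns[ZONE_DMA]    = PFN_DOWN(0x1_0000_0000) = 0x100000
 *   max_zone_pfns[ZONE_DMA32]  = PFN_DOWN(0x1_0000_0000) = 0x100000 (same limit here)
 *   max_zone_pfns[ZONE_NORMAL] = max                     = 0x180000
 *
 * free_area_init() below turns these upper bounds into [low, high) PFN ranges:
 *   ZONE_DMA    : 0x080000 .. 0x100000
 *   ZONE_DMA32  : empty (its limit equals ZONE_DMA's)
 *   ZONE_NORMAL : 0x100000 .. 0x180000
 */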
/**
* free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
* Using the page ranges provided by memblock_set_node(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
void __init free_area_init(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
int i, nid, zone;
bool descending;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
start_pfn = find_min_pfn_with_active_regions();
descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
if (descending)
zone = MAX_NR_ZONES - i - 1;
else
zone = i;
if (zone == ZONE_MOVABLE)
continue;
end_pfn = max(max_zone_pfn[zone], start_pfn);
// record the lower and upper PFN bounds of each zone
arch_zone_lowest_possible_pfn[zone] = start_pfn;
arch_zone_highest_possible_pfn[zone] = end_pfn;
start_pfn = end_pfn;
}
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
pr_info("Zone ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
pr_info(" %-8s ", zone_names[i]);
if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
pr_cont("empty\n");
else
pr_cont("[mem %#018Lx-%#018Lx]\n",
(u64)arch_zone_lowest_possible_pfn[i]
<< PAGE_SHIFT,
((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
pr_info(" Node %d: %#018Lx\n", i,
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
/*
* Print out the early node map, and initialize the
* subsection-map relative to active online memory ranges to
* enable future "sub-section" extensions of the memory map.
*/
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
subsection_map_init(start_pfn, end_pfn - start_pfn);
}
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid);
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
}
The function looks long, but it mainly converts the max_zone_pfn argument into per-zone [low, high] ranges and then calls free_area_init_node() for each node.
static void __init free_area_init_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
// find this node's lower and upper PFN bounds
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pgdat->node_id = nid;
pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
calculate_node_totalpages(pgdat, start_pfn, end_pfn);
// allocate node_mem_map; in the flatmem case node 0's node_mem_map also becomes the global mem_map
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
// initialize this node's zones
free_area_init_core(pgdat);
}
Initialization here just means filling in the members of the relevant structures; above you can see the node members node_id and node_start_pfn being set.
The zones themselves are initialized by free_area_init_core().
static void __init free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
pgdat_init_internals(pgdat);
pgdat->per_cpu_nodestats = &boot_nodestats;
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages = calc_memmap_size(size, freesize);
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
zone_init_internals(zone, j, nid, freesize);
if (!size)
continue;
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
init_currently_empty_zone(zone, zone_start_pfn, size);
memmap_init(size, nid, j, zone_start_pfn);
}
}
init_currently_empty_zone() initializes the zone's free_area.
void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int zone_idx = zone_idx(zone) + 1;
if (zone_idx > pgdat->nr_zones)
pgdat->nr_zones = zone_idx;
zone->zone_start_pfn = zone_start_pfn;
mminit_dprintk(MMINIT_TRACE, "memmap_init",
"Initialising map node %d zone %lu pfns %lu -> %lu\n",
pgdat->node_id,
(unsigned long)zone_idx(zone),
zone_start_pfn, (zone_start_pfn + size));
zone_init_free_lists(zone);
zone->initialized = 1;
}
zone_init_free_lists() initializes free_area, the data structure the buddy system relies on.
static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}
At this point every free list is empty and nr_free is zero for all orders; obviously that is not the final state.
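The free lists are filled later, when memblock hands its free memory over to the buddy allocator; on arm64 that happens via mm_init() -> mem_init() -> memblock_free_all(). The sketch below only illustrates the idea of that hand-over: the real path goes through __free_pages_memory()/__free_pages_core() (both internal to mm/) and frees the largest possible orders rather than single pages:
/*
 * Simplified sketch of what memblock_free_all() achieves: every range that
 * is in memblock.memory but not in memblock.reserved is released into
 * zone->free_area. Illustration only, not the actual kernel path.
 */
static void __init hand_free_ranges_to_buddy(void)
{
	phys_addr_t start, end;
	u64 i;
	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
		unsigned long pfn = PFN_UP(start);
		unsigned long last = PFN_DOWN(end);
		for (; pfn < last; pfn++)
			/* the real code frees high-order blocks, not order-0 pages */
			__free_pages_core(pfn_to_page(pfn), 0);
	}
}
Back in free_area_init_core(), the last step for each zone is memmap_init():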
void __meminit __weak memmap_init(unsigned long size, int nid,
unsigned long zone,
unsigned long range_start_pfn)
{
unsigned long start_pfn, end_pfn;
unsigned long range_end_pfn = range_start_pfn + size;
int i;
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
if (end_pfn > start_pfn) {
size = end_pfn - start_pfn;
memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn,
MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
}
}
}
memmap_init() initializes every struct page in the zone and initially marks the pageblocks as MIGRATE_MOVABLE.
That completes the setup_arch() part, which mainly initialized the nodes and zones. One node member is still untouched: node_zonelists, which is handled by build_all_zonelists().
node_zonelists lists the zones of every node in the system; when the local node runs short of memory, this list is used to fall back to memory on other nodes. First, what does node_zonelists look like?
enum {
ZONELIST_FALLBACK, /* zonelist with fallback */
#ifdef CONFIG_NUMA
/*
* The NUMA zonelists are doubled because we need zonelists that
* restrict the allocations to a single node for __GFP_THISNODE.
*/
ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
#endif
MAX_ZONELISTS
};
//one or two zonelists, depending on CONFIG_NUMA
struct zonelist node_zonelists[MAX_ZONELISTS];
// each zonelist can hold every possible zone in the system
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
struct zonelist {
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
So node_zonelists is effectively a two-dimensional array whose inner array can hold every zone in the system; the job of initialization is to decide how all those zones are arranged in it.
Let 0, 1 and 2 stand for ZONE_DMA, ZONE_DMA32 and ZONE_NORMAL, and assume three nodes with the current node being A. The fallback list built by the code below comes out as A2, A1, A0, B2, B1, B0, C2, C1, C0: the local node's zones come first (matching the low-to-high access cost order mentioned earlier), followed by the remaining nodes starting from the one after the local node and wrapping around. Within each node the zones are ordered from high to low, because DMA memory is reserved for devices and should only be tapped once NORMAL is exhausted.
void __ref build_all_zonelists(pg_data_t *pgdat)
{
unsigned long vm_total_pages;
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
// the real work happens in __build_all_zonelists(); build_all_zonelists_init() above ends up on the same path at boot
__build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
/* Get the number of free pages beyond high watermark in all zones. */
vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
...
}
The function that actually fills in the zonelists is build_zonelists().
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
struct zoneref *zonerefs;
int nr_zones;
local_node = pgdat->node_id;
zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
nr_zones = build_zonerefs_node(pgdat, zonerefs);
zonerefs += nr_zones;
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo N)
*/
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
zonerefs += nr_zones;
}
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}
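To close the loop, here is a minimal sketch of how the allocator later consumes this list: for_each_zone_zonelist() walks the fallback zonelist from the highest allowed zone downwards, which is exactly the order built above. This is only an illustration; the real logic lives in get_page_from_freelist():
#include <linux/mmzone.h>
#include <linux/gfp.h>
/* Minimal sketch: walk the fallback zonelist the way the allocator does. */
static struct zone *first_usable_zone(int nid, gfp_t gfp)
{
	struct zonelist *zonelist = node_zonelist(nid, gfp);
	enum zone_type highidx = gfp_zone(gfp);
	struct zoneref *z;
	struct zone *zone;
	/* visits zones in fallback order, skipping zones above highidx */
	for_each_zone_zonelist(zone, z, zonelist, highidx) {
		if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), highidx, 0))
			return zone;	/* enough free pages here */
	}
	return NULL;
}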
With that, the most basic memory-management structures, the nodes and zones, have been initialized.
