5.8 初始化内存管理
回到start_kernel,下一个函数执行mm_init()。这个函数很重要了,来自同一个文件。
/*
 * mm_init() - memory-management setup step called from start_kernel().
 * Calls, in this order: page_cgroup_init_flatmem(), mem_init(),
 * kmem_cache_init(), pgtable_cache_init() and vmalloc_init().
 */
static void __init mm_init(void) { /* * page_cgroup requires countinous pages as memmap * and it's bigger than MAX_ORDER unless SPARSEMEM. */ page_cgroup_init_flatmem(); mem_init(); kmem_cache_init(); pgtable_cache_init(); vmalloc_init(); } |
这五个函数,其中由于我们没有配置CONFIG_CGROUP_MEM_RES_CTLR,所以第一个函数page_cgroup_init_flatmem是个空函数。其余几个函数各个都是重点。
该函数执行完后不能再用像alloc_bootmem()、alloc_bootmem_low()、alloc_bootmem_pages()等申请低端内存的函数来申请内存,也就不能申请大块的连续物理内存了。
5.8.1 启用伙伴算法
首先是mem_init,来自arch/x86/mm/init_32.c:
/*
 * mem_init() - listing with the original kernel line numbers (867-964).
 *
 * Steps visible in the listing:
 *  - adds the return value of free_all_bootmem() to totalram_pages;
 *  - counts pages below max_low_pfn that are RAM and PageReserved;
 *  - calls set_highmem_pages_init();
 *  - computes code/data/init sizes from the linker symbols and prints
 *    the memory summary and the virtual kernel memory layout;
 *  - checks the layout with BUILD_BUG_ON() and again with BUG_ON();
 *  - calls test_wp_bit() if boot_cpu_data.wp_works_ok < 0;
 *  - calls save_pg_dir() and zap_low_mappings(true).
 *
 * NOTE(review): the "/n" sequences inside the printk format strings look
 * like a garbled "\n" - confirm against the kernel source.
 */
867void __init mem_init(void) 868{ 869 int codesize, reservedpages, datasize, initsize; 870 int tmp; 871 872 pci_iommu_alloc(); 873 874#ifdef CONFIG_FLATMEM 875 BUG_ON(!mem_map); 876#endif 877 /* this will put all low memory onto the freelists */ 878 totalram_pages += free_all_bootmem(); 879 880 reservedpages = 0; 881 for (tmp = 0; tmp < max_low_pfn; tmp++) 882 /* 883 * Only count reserved RAM pages: 884 */ 885 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 886 reservedpages++; 887 888 set_highmem_pages_init(); 889 890 codesize = (unsigned long) &_etext - (unsigned long) &_text; 891 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 892 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 893 894 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " 895 "%dk reserved, %dk data, %dk init, %ldk highmem)/n", 896 nr_free_pages() << (PAGE_SHIFT-10), 897 num_physpages << (PAGE_SHIFT-10), 898 codesize >> 10, 899 reservedpages << (PAGE_SHIFT-10), 900 datasize >> 10, 901 initsize >> 10, 902 totalhigh_pages << (PAGE_SHIFT-10)); 903 904 printk(KERN_INFO "virtual kernel memory layout:/n" 905 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)/n" 906#ifdef CONFIG_HIGHMEM 907 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)/n" 908#endif 909 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)/n" 910 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)/n" 911 " .init : 0x%08lx - 0x%08lx (%4ld kB)/n" 912 " .data : 0x%08lx - 0x%08lx (%4ld kB)/n" 913 " .text : 0x%08lx - 0x%08lx (%4ld kB)/n", 914 FIXADDR_START, FIXADDR_TOP, 915 (FIXADDR_TOP - FIXADDR_START) >> 10, 916 917#ifdef CONFIG_HIGHMEM 918 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, 919 (LAST_PKMAP*PAGE_SIZE) >> 10, 920#endif 921 922 VMALLOC_START, VMALLOC_END, 923 (VMALLOC_END - VMALLOC_START) >> 20, 924 925 (unsigned long)__va(0), (unsigned long)high_memory, 926 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, 927 928 (unsigned long)&__init_begin, (unsigned long)&__init_end, 929 ((unsigned long)&__init_end - 930
(unsigned long)&__init_begin) >> 10, 931 932 (unsigned long)&_etext, (unsigned long)&_edata, 933 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, 934 935 (unsigned long)&_text, (unsigned long)&_etext, 936 ((unsigned long)&_etext - (unsigned long)&_text) >> 10); 937 938 /* 939 * Check boundaries twice: Some fundamental inconsistencies can 940 * be detected at build time already. 941 */ 942#define __FIXADDR_TOP (-PAGE_SIZE) 943#ifdef CONFIG_HIGHMEM 944 BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 945 BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); 946#endif 947#define high_memory (-128UL << 20) 948 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); 949#undef high_memory 950#undef __FIXADDR_TOP 951 952#ifdef CONFIG_HIGHMEM 953 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); 954 BUG_ON(VMALLOC_END > PKMAP_BASE); 955#endif 956 BUG_ON(VMALLOC_START >= VMALLOC_END); 957 BUG_ON((unsigned long)high_memory > VMALLOC_START); 958 959 if (boot_cpu_data.wp_works_ok < 0) 960 test_wp_bit(); 961 962 save_pg_dir(); 963 zap_low_mappings(true); 964} |
872行,Intel IOMMU架构在Linux上的初始化函数pci_iommu_alloc。这个函数不是我们关注的重点,我们就不深入下去了,这里仅仅粗略地介绍一下。该函数首先通过读取 DMA Remapping table,来判断是否支持DMAR设备。随后调用pci_swiotlb_init函数对其进行初始化,解析DMAR table,并逐一打印每个dmar项。最后设置全局变量dma_ops,把初始化后的swiotlb_dma_ops传递给它,后者定义了IOMMU架构中所有的swiotlb方法。对IOMMU感兴趣的同学可以去查阅相关资料,这里就不详细介绍了。
878行,totalram_pages这个全局变量我们第一次遇见。它编译的时候初始化为0,现在它就等于free_all_bootmem函数的返回值,该函数在mm/bootmem.c中定义:
/*
 * free_all_bootmem() - returns a page count to the caller.
 * With CONFIG_NO_BOOTMEM: returns free_all_memory_core_early(MAX_NUMNODES).
 * Otherwise: returns the sum of free_all_bootmem_core() over every entry
 * on bdata_list.
 */
unsigned long __init free_all_bootmem(void) { #ifdef CONFIG_NO_BOOTMEM /* * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesnt have RAM installed * low ram will be on Node1 * Use MAX_NUMNODES will make sure all ranges in early_node_map[] * will be used instead of only Node0 related */ return free_all_memory_core_early(MAX_NUMNODES); #else unsigned long total_pages = 0; bootmem_data_t *bdata;
list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata);
return total_pages; #endif } |
我们看到,由于CONFIG_NO_BOOTMEM起作用,并且MAX_NUMNODES为1,所以函数直接调用free_all_memory_core_early(1),怎么样,前面说得没错吧,终于碰到了这个函数:
/*
 * free_all_memory_core_early() - listing with original line numbers 200-218.
 * Gets an array of ranges from get_free_all_memory_range(), hands each
 * start/end pair to __free_pages_memory(), and returns the sum of
 * (end - start) over all ranges.
 */
200unsigned long __init free_all_memory_core_early(int nodeid) 201{ 202 int i; 203 u64 start, end; 204 unsigned long count = 0; 205 struct range *range = NULL; 206 int nr_range; 207 208 nr_range = get_free_all_memory_range(&range, nodeid); 209 210 for (i = 0; i < nr_range; i++) { 211 start = range[i].start; 212 end = range[i].end; 213 count += end - start; 214 __free_pages_memory(start, end); 215 } 216 217 return count; 218} |
205行的那个range结构很简单:
/*
 * A start/end pair. In the listings of this article it is filled with
 * page frame numbers (early_node_map[i].start_pfn / end_pfn), and loops
 * over it run while i < end.
 */
struct range {
u64 start;
u64 end;
};
所以首先208行调用get_free_all_memory_range函数:
/*
 * get_free_all_memory_range() - listing with original line numbers 393-441.
 *
 * Steps visible in the listing:
 *  - count = 2 * (number of leading early_res[] entries with a non-zero
 *    end); size = sizeof(struct range) * count;
 *  - finds room for the array with find_fw_memmap_area() in [start, end),
 *    where end = get_max_mapped(); panics if nothing is found;
 *  - zeroes the array and fills it via add_from_early_node_map();
 *  - on CONFIG_X86_32, calls subtract_range() with max_low_pfn and -1ULL;
 *  - calls subtract_early_res(), then clean_sort_range();
 *  - when nodeid == MAX_NUMNODES, zeroes early_res[] and resets
 *    early_res / max_early_res;
 *  - stores the array in *rangep and returns nr_range.
 */
393int __init get_free_all_memory_range(struct range **rangep, int nodeid) 394{ 395 int i, count; 396 u64 start = 0, end; 397 u64 size; 398 u64 mem; 399 struct range *range; 400 int nr_range; 401 402 count = 0; 403 for (i = 0; i < max_early_res && early_res[i].end; i++) 404 count++; 405 406 count *= 2; 407 408 size = sizeof(struct range) * count; 409 end = get_max_mapped(); 410#ifdef MAX_DMA32_PFN 411 if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) 412 start = MAX_DMA32_PFN << PAGE_SHIFT; 413#endif 414 mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); 415 if (mem == -1ULL) 416 panic("can not find more space for range free"); 417 418 range = __va(mem); 419 /* use early_node_map[] and early_res to get range array at first */ 420 memset(range, 0, size); 421 nr_range = 0; 422 423 /* need to go over early_node_map to find out good range for node */ 424 nr_range = add_from_early_node_map(range, count, nr_range, nodeid); 425#ifdef CONFIG_X86_32 426 subtract_range(range, count, max_low_pfn, -1ULL); 427#endif 428 subtract_early_res(range, count); 429 nr_range = clean_sort_range(range, count); 430 431 /* need to clear it ? */ 432 if (nodeid == MAX_NUMNODES) { 433 memset(&early_res[0], 0, 434 sizeof(struct early_res) * max_early_res); 435 early_res = NULL; 436 max_early_res = 0; 437 } 438 439 *rangep = range; 440 return nr_range; 441} |
403行,全局变量max_early_res和early_res[]数组,老熟人了,一个循环得到目前已经分配了的early_res元素的个数count;406行把count乘以2,408行再用sizeof(struct range)乘以count得到所需的字节数size。409行,调用get_max_mapped函数:
/*
 * get_max_mapped() - returns max_pfn_mapped shifted left by PAGE_SHIFT,
 * i.e. the page frame number converted to a byte address. The value is
 * widened to u64 before the shift.
 */
u64 __init get_max_mapped(void)
{
u64 end = max_pfn_mapped;
end <<= PAGE_SHIFT;
return end;
}
该函数返回我们的老熟人,最后一个页框max_pfn_mapped对应的物理地址,赋值给内部变量end(start在396行被赋值为0)。然后414行,调用find_fw_memmap_area函数,传给他的参数是start、end、size和range结构的大小:
/*
 * find_fw_memmap_area() - thin wrapper: forwards start, end, size and
 * align unchanged to find_e820_area() and returns its result.
 */
u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
{
return find_e820_area(start, end, size, align);
}
find_e820_area不用多说了吧,从e820.map[]数组中寻找到一块能够容纳size个字节的内存段,该内存段的首物理地址赋值给get_free_all_memory_range的内部变量mem。418~421行初始化这块区域。随后424行调用add_from_early_node_map函数:
/*
 * add_from_early_node_map() - for every early_node_map[] index selected by
 * for_each_active_range_index_in_nid(i, nid), passes start_pfn/end_pfn to
 * add_range(); returns the resulting entry count. @az is the capacity of
 * range[] as checked by add_range().
 */
int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { int i; u64 start, end;
/* need to go over early_node_map to find out good range for node */ for_each_active_range_index_in_nid(i, nid) { start = early_node_map[i].start_pfn; end = early_node_map[i].end_pfn; nr_range = add_range(range, az, nr_range, start, end); } return nr_range; }
/*
 * add_range() - stores start/end in range[nr_range] and returns
 * nr_range + 1. Returns nr_range unchanged, without storing anything,
 * when start >= end or when nr_range >= az (no free slot).
 */
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) { if (start >= end) return nr_range;
/* Out of slots: */ if (nr_range >= az) return nr_range;
range[nr_range].start = start; range[nr_range].end = end;
nr_range++;
return nr_range; } |
执行完毕add_from_early_node_map函数之后,range指向的这块区域中,就形成了一个range[nr_range]数组,每个数组元素对应early_node_map[]的数组元素,表示nr_range块空闲内存空间的起始页框号和结束页框号。426行(仅在CONFIG_X86_32下)调用subtract_range函数,以max_low_pfn和-1ULL为参数,把max_low_pfn及其以上的页框从range中去掉,只留下低端内存。428行,调用subtract_early_res对产生冲突的地址进行调整:
/*
 * subtract_early_res() - counts the leading early_res[] entries with a
 * non-zero end, then, for each of them, calls subtract_range() on @range
 * with PFN_DOWN(r->start) and PFN_UP(r->end). Entries whose rounded span
 * is empty are skipped. Index 0 is skipped when early_res != early_res_x.
 * Each entry is printed while DEBUG_PRINT_EARLY_RES is non-zero.
 *
 * NOTE(review): the "/n" sequences in the printk format strings look like
 * a garbled "\n" - confirm against the kernel source.
 */
static void __init subtract_early_res(struct range *range, int az) { int i, count; u64 final_start, final_end; int idx = 0;
count = 0; for (i = 0; i < max_early_res && early_res[i].end; i++) count++;
/* need to skip first one ?*/ if (early_res != early_res_x) idx = 1;
#define DEBUG_PRINT_EARLY_RES 1
#if DEBUG_PRINT_EARLY_RES printk(KERN_INFO "Subtract (%d early reservations)/n", count); #endif for (i = idx; i < count; i++) { struct early_res *r = &early_res[i]; #if DEBUG_PRINT_EARLY_RES printk(KERN_INFO " #%d [%010llx - %010llx] %15s/n", i, r->start, r->end, r->name); #endif final_start = PFN_DOWN(r->start); final_end = PFN_UP(r->end); if (final_start >= final_end) continue; subtract_range(range, az, final_start, final_end); }
} |
对early_res体系熟悉的同学对上述代码一定不会困惑,我们看到subtract_early_res对地址进行调整,去掉那些已经被占用了的地址空间。回到get_free_all_memory_range,最后两行,把range赋给结果参数rangep,并且返回最终的range数组的元素个数nr_range。
回到free_all_memory_core_early函数中,内部变量range有了,其元素个数nr_range也有了,那么210~215执行一个循环,将range数组的每一个元素调用__free_pages_memory进行释放:
/*
 * __free_pages_memory() - listing with original line numbers 174-198.
 * Frees page frames start .. end-1 through __free_pages_bootmem().
 * start is rounded up and end rounded down to a multiple of BITS_PER_LONG:
 *  - if no aligned group fits (end_aligned <= start_aligned), every frame
 *    is freed individually with order 0 (lines 183-188);
 *  - otherwise the unaligned head and tail are freed frame by frame with
 *    order 0, and the aligned middle is freed in groups of BITS_PER_LONG
 *    frames with order ilog2(BITS_PER_LONG).
 */
174static void __init __free_pages_memory(unsigned long start, unsigned long end) 175{ 176 int i; 177 unsigned long start_aligned, end_aligned; 178 int order = ilog2(BITS_PER_LONG); 179 180 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); 181 end_aligned = end & ~(BITS_PER_LONG - 1); 182 183 if (end_aligned <= start_aligned) { 184 for (i = start; i < end; i++) 185 __free_pages_bootmem(pfn_to_page(i), 0); 186 187 return; 188 } 189 190 for (i = start; i < start_aligned; i++) 191 __free_pages_bootmem(pfn_to_page(i), 0); 192 193 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) 194 __free_pages_bootmem(pfn_to_page(i), order); 195 196 for (i = end_aligned; i < end; i++) 197 __free_pages_bootmem(pfn_to_page(i), 0); 198} |
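当start到end之间容不下一个按BITS_PER_LONG对齐的完整块时,函数执行183~188行代码,通过__free_pages_bootmem函数逐个释放页框号从start到end-1的页框;否则执行190~197行,对首尾未对齐的部分逐页释放(order为0),对中间对齐的部分以BITS_PER_LONG个页框为一组、按order = ilog2(BITS_PER_LONG)释放。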
下面来看看__free_pages_bootmem:
/*
 * __free_pages_bootmem() - listing with original line numbers 637-660.
 * order == 0: clears the page's reserved flag, sets its count to 0, calls
 * set_page_refcounted() and releases it with __free_page().
 * order != 0: clears the reserved flag and zeroes the count on
 * BITS_PER_LONG consecutive page descriptors (prefetching the next one),
 * then calls set_page_refcounted() on the first page and releases the
 * group with __free_pages(page, order).
 */
637void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 638{ 639 if (order == 0) { 640 __ClearPageReserved(page); 641 set_page_count(page, 0); 642 set_page_refcounted(page); 643 __free_page(page); 644 } else { 645 int loop; 646 647 prefetchw(page); 648 for (loop = 0; loop < BITS_PER_LONG; loop++) { 649 struct page *p = &page[loop]; 650 651 if (loop + 1 < BITS_PER_LONG) 652 prefetchw(p + 1); 653 __ClearPageReserved(p); 654 set_page_count(p, 0); 655 } 656 657 set_page_refcounted(page); 658 __free_pages(page, order); 659 } 660} |
以逐页释放的情形为例,传递进来的参数order为0,所以来到643行,针对这个页面page,著名的伙伴算法到来了,我们来看它的定义:
/* Shorthand for __free_pages() with order 0. */
#define __free_page(page) __free_pages((page), 0)
释放页框的所有内核宏和函数都依赖于__free_pages()函数。它接收的参数为将要释放的第一个页框的页描述符的地址(page)和将要释放的一组连续页框的数量的对数(order)。该函数执行如下步骤:
1. 检查第一个页框是否真正属于动态内存(它的PG_reserved 标志被清0);如果不是,则终止。
2. 减少page->_count 使用计数器的值;如果它仍然大于或等于0,则终止。
3. 如果order 等于0,那么该函数调用free_hot_page()来释放页框给适当内存管理区的每CPU 热高速缓存。
4. 如果order大于0,那么它将页框加入到本地链表中,并调用free_pages_bulk()函数把它们释放到适当内存管理区的伙伴系统中。
我们这里order为0,所以调用free_hot_page(),最终会调用__free_one_page。由于前面的pglist和zone的体系已经建立好,该函数对当前页面page对应的那个zone的free_area数组进行处理。由于这个地方是第一次触及该数组,那么这一次free_hot_page调用的__free_one_page将会找到全部伙伴,等于是初始化了整个伙伴算法系统。好了,怀疑我这句话的同志可以去看看博客“伙伴系统算法”
http://blog.csdn.net/yunsongice/archive/2010/01/22/5225155.aspx
回到mem_init函数中,伙伴系统建立起来以后,free_all_bootmem返回空闲页面的总数,并累加到全局变量totalram_pages上。随后880~886行代码计算被保留的页面数,保存在内部变量reservedpages中。888行,set_highmem_pages_init函数,通过调用add_highpages_work_fn函数初始化896MB以上的高端页面,并把它们加入伙伴系统,最后计算出包含了这些高端页面的新的可用页面的数量totalram_pages。
继续走,890~892行,让内部变量codesize、datasize、initsize分别等于内核代码段(_text到_etext)、数据段(_etext到_edata)和初始化段(__init_begin到__init_end)的大小。随后894~936行打印相关信息。942~957行是一组地址布局的边界检查,先用BUILD_BUG_ON在编译期检查一遍,再用BUG_ON在运行期检查一遍,略去。962行save_pg_dir()函数,来自同一文件:
/* Page-aligned, PAGE_SIZE-byte buffer that receives a copy of swapper_pg_dir. */
char swsusp_pg_dir[PAGE_SIZE]
__attribute__ ((aligned(PAGE_SIZE)));
/* save_pg_dir() - copies PAGE_SIZE bytes from swapper_pg_dir into swsusp_pg_dir. */
static inline void save_pg_dir(void)
{
memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
}
很简单,就是把页全局目录拷贝到全局变量swsusp_pg_dir数组中,做个备份。963行,执行zap_low_mappings(true)函数,这个函数也来自于同一个文件:
/*
 * zap_low_mappings() - overwrites swapper_pg_dir entries
 * 0 .. KERNEL_PGD_BOUNDARY-1: with __pgd(1 + __pa(empty_zero_page)) under
 * CONFIG_X86_PAE, with __pgd(0) otherwise. Afterwards flushes the TLB with
 * __flush_tlb() when @early is true, or flush_tlb_all() when it is false.
 */
void zap_low_mappings(bool early) { int i;
/* * Zap initial low-memory mappings. * * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else set_pgd(swapper_pg_dir+i, __pgd(0)); #endif }
if (early) __flush_tlb(); else flush_tlb_all(); } |
这个函数很简单,就是把前面我们在arch/x86/kernel/head_32.S中设置的页全局目录的前若干项清零。这若干项到底是多少项呢?我们看看KERNEL_PGD_BOUNDARY是什么东西:
/* Page-global-directory index of PAGE_OFFSET. */
#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
/* Directory index of an address: shift right by PGDIR_SHIFT, mask to PTRS_PER_PGD entries. */
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
/* With these values each directory entry covers 1 << 22 bytes (4 MB) and there are 1024 entries. */
#define PGDIR_SHIFT 22
#define PTRS_PER_PGD 1024
不错,0xc0000000>>22 & 1023 = 768,这些页全局目录项代表虚拟地址前3G的页面,也就是所谓的用户区,我们在这里把它全清零了。