Linux page frame management uses the buddy system algorithm. Compared with bootmem it allocates faster, finding free memory quickly, and it effectively counters external fragmentation.
The page frame handling code lives mainly in mm/page_alloc.c.
I. Main data structures
i. Page descriptor: struct page
Page frame management is the core of memory management, so the kernel must know the exact state of every page frame: whether it is free, whether it holds code or data, whether it has been modified, and so on.
Every page frame has a corresponding page descriptor that records this information; the page descriptor is represented by struct page.
33 /*
34 * Each physical page in the system has a struct page associated with
35 * it to keep track of whatever it is we are using the page for at the
36 * moment. Note that we have no way to track which tasks are using
37 * a page, though if it is a pagecache page, rmap structures can tell us
38 * who is mapping it.
39 */
40 struct page {
41 unsigned long flags; /* Atomic flags, some possibly
42 * updated asynchronously */
43 atomic_t _count; /* Usage count, see below. */
44 union {
45 atomic_t _mapcount; /* Count of ptes mapped in mms,
46 * to show when page is mapped
47 * & limit reverse map searches.
48 */
49 struct { /* SLUB */
50 u16 inuse;
51 u16 objects;
52 };
53 };
54 union {
55 struct {
56 unsigned long private; /* Mapping-private opaque data:
57 * usually used for buffer_heads
58 * if PagePrivate set; used for
59 * swp_entry_t if PageSwapCache;
60 * indicates order in the buddy
61 * system if PG_buddy is set.
62 */
63 struct address_space *mapping; /* If low bit clear, points to
64 * inode address_space, or NULL.
65 * If page mapped as anonymous
66 * memory, low bit is set, and
67 * it points to anon_vma object:
68 * see PAGE_MAPPING_ANON below.
69 */
70 };
71 #if USE_SPLIT_PTLOCKS
72 spinlock_t ptl;
73 #endif
74 struct kmem_cache *slab; /* SLUB: Pointer to slab */
75 struct page *first_page; /* Compound tail pages */
76 };
77 union {
78 pgoff_t index; /* Our offset within mapping. */
79 void *freelist; /* SLUB: freelist req. slab lock */
80 };
81 struct list_head lru; /* Pageout list, eg. active_list
82 * protected by zone->lru_lock !
83 */
84 /*
85 * On machines where all RAM is mapped into kernel address space,
86 * we can simply calculate the virtual address. On machines with
87 * highmem some memory is mapped into kernel virtual memory
88 * dynamically, so we need a place to store that address.
89 * Note that this field could be 16 bits on x86 ... ;)
90 *
91 * Architectures with slow multiplication can define
92 * WANT_PAGE_VIRTUAL in asm/page.h
93 */
94 #if defined(WANT_PAGE_VIRTUAL)
95 void *virtual; /* Kernel virtual address (NULL if
96 not kmapped, ie. highmem) */
97 #endif /* WANT_PAGE_VIRTUAL */
98 #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
99 unsigned long debug_flags; /* Use atomic bitops on this */
100 #endif
101
102 #ifdef CONFIG_KMEMCHECK
103 /*
104 * kmemcheck wants to track the status of each byte in a page; this
105 * is a pointer to such a status block. NULL if not tracked.
106 */
107 void *shadow;
108 #endif
109 };
flags: page flag bits describing the frame's current state; e.g. PG_buddy means the frame belongs to the buddy system
lru: links page descriptors into lists; e.g. the buddy system chains the first page descriptor of each free block into that order's free list through lru
private: page-private data; when the frame is in the buddy system it stores the block's order
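Because the page descriptors form an array (mem_map, which, as noted later, points to the node's node_mem_map), a page frame number and its descriptor convert into each other by plain pointer arithmetic. A minimal sketch, assuming the flat memory model with ARCH_PFN_OFFSET == 0 as on x86; these helpers are illustrative and are not the kernel's real pfn_to_page()/page_to_pfn() definitions:

#include <linux/mm.h>

/* Illustration only: assumes FLATMEM and ARCH_PFN_OFFSET == 0 (x86). */
static struct page *pfn_to_page_flat(unsigned long pfn)
{
	return mem_map + pfn;		/* mem_map is the global array of page descriptors */
}

static unsigned long page_to_pfn_flat(struct page *page)
{
	return page - mem_map;
}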
ii. Zone descriptor: struct zone
The hardware architecture places constraints on how page frames may be used; not every frame can be used for every purpose (ISA-bus DMA can only address the low 16MB, and a 32-bit CPU's kernel address space cannot directly map all physical memory). Linux therefore groups page frames into zones.
On x86 the main zones are:
ZONE_DMA: below 16MB
ZONE_NORMAL: the kernel's directly mapped region, 16MB~896MB
ZONE_HIGHMEM: not directly mappable, above 896MB
Each zone is described by a zone descriptor:
286 struct zone {
287 /* Fields commonly accessed by the page allocator */
288
289 /* zone watermarks, access with *_wmark_pages(zone) macros */
290 unsigned long watermark[NR_WMARK];
291
292 /*
293 * When free pages are below this point, additional steps are taken
294 * when reading the number of free pages to avoid per-cpu counter
295 * drift allowing watermarks to be breached
296 */
297 unsigned long percpu_drift_mark;
298
299 /*
300 * We don't know if the memory that we're going to allocate will be freeable
301 * or/and it will be released eventually, so to avoid totally wasting several
302 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
303 * to run OOM on the lower zones despite there's tons of freeable ram
304 * on the higher zones). This array is recalculated at runtime if the
305 * sysctl_lowmem_reserve_ratio sysctl changes.
306 */
307 unsigned long lowmem_reserve[MAX_NR_ZONES];
308
309 #ifdef CONFIG_NUMA
310 int node;
311 /*
312 * zone reclaim becomes active if more unmapped pages exist.
313 */
314 unsigned long min_unmapped_pages;
315 unsigned long min_slab_pages;
316 struct per_cpu_pageset *pageset[NR_CPUS];
317 #else
318 struct per_cpu_pageset pageset[NR_CPUS];
319 #endif
320 /*
321 * free areas of different sizes
322 */
323 spinlock_t lock;
324 #ifdef CONFIG_MEMORY_HOTPLUG
325 /* see spanned/present_pages for more description */
326 seqlock_t span_seqlock;
327 #endif
328 struct free_area free_area[MAX_ORDER];
329
330 #ifndef CONFIG_SPARSEMEM
331 /*
332 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
333 * In SPARSEMEM, this map is stored in struct mem_section
334 */
335 unsigned long *pageblock_flags;
336 #endif /* CONFIG_SPARSEMEM */
337
338
339 ZONE_PADDING(_pad1_)
340
341 /* Fields commonly accessed by the page reclaim scanner */
342 spinlock_t lru_lock;
343 struct zone_lru {
344 struct list_head list;
345 } lru[NR_LRU_LISTS];
346
347 struct zone_reclaim_stat reclaim_stat;
348
349 unsigned long pages_scanned; /* since last reclaim */
350 unsigned long flags; /* zone flags, see below */
351
352 /* Zone statistics */
353 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
354
355 /*
356 * prev_priority holds the scanning priority for this zone. It is
357 * defined as the scanning priority at which we achieved our reclaim
358 * target at the previous try_to_free_pages() or balance_pgdat()
359 * invokation.
360 *
361 * We use prev_priority as a measure of how much stress page reclaim is
362 * under - it drives the swappiness decision: whether to unmap mapped
363 * pages.
364 *
365 * Access to both this field is quite racy even on uniprocessor. But
366 * it is expected to average out OK.
367 */
368 int prev_priority;
369
370 /*
371 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
372 * this zone's LRU. Maintained by the pageout code.
373 */
374 unsigned int inactive_ratio;
375
376
377 ZONE_PADDING(_pad2_)
378 /* Rarely used or read-mostly fields */
379
380 /*
381 * wait_table -- the array holding the hash table
382 * wait_table_hash_nr_entries -- the size of the hash table array
383 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
384 *
385 * The purpose of all these is to keep track of the people
386 * waiting for a page to become available and make them
387 * runnable again when possible. The trouble is that this
388 * consumes a lot of space, especially when so few things
389 * wait on pages at a given time. So instead of using
390 * per-page waitqueues, we use a waitqueue hash table.
391 *
392 * The bucket discipline is to sleep on the same queue when
393 * colliding and wake all in that wait queue when removing.
394 * When something wakes, it must check to be sure its page is
395 * truly available, a la thundering herd. The cost of a
396 * collision is great, but given the expected load of the
397 * table, they should be so rare as to be outweighed by the
398 * benefits from the saved space.
399 *
400 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
401 * primary users of these fields, and in mm/page_alloc.c
402 * free_area_init_core() performs the initialization of them.
403 */
404 wait_queue_head_t * wait_table;
405 unsigned long wait_table_hash_nr_entries;
406 unsigned long wait_table_bits;
407
408 /*
409 * Discontig memory support fields.
410 */
411 struct pglist_data *zone_pgdat;
412 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
413 unsigned long zone_start_pfn;
414
415 /*
416 * zone_start_pfn, spanned_pages and present_pages are all
417 * protected by span_seqlock. It is a seqlock because it has
418 * to be read outside of zone->lock, and it is done in the main
419 * allocator path. But, it is written quite infrequently.
420 *
421 * The lock is declared along with zone->lock because it is
422 * frequently read in proximity to zone->lock. It's good to
423 * give them a chance of being in the same cacheline.
424 */
425 unsigned long spanned_pages; /* total size, including holes */
426 unsigned long present_pages; /* amount of memory (excluding holes) */
427
428 /*
429 * rarely used fields:
430 */
431 const char *name;
432 } ____cacheline_internodealigned_in_smp;
free_area: the buddy system's free blocks; each zone runs its own buddy system
pageset: the zone's per-CPU page frame cache
zone_pgdat: the node this zone belongs to
zone_start_pfn: the zone's starting page frame number
spanned_pages: number of page frames spanned by the zone, including holes
present_pages: number of page frames actually present in the zone, excluding holes
name: the zone's name
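As a hedged illustration of how free_area and zone->lock work together (a sketch in the spirit of what /proc/buddyinfo reports, not code taken from the kernel tree), a zone's free lists could be walked like this:

#include <linux/kernel.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>

/* Sketch: print the number of free blocks of each order in one zone. */
static void dump_zone_free_lists(struct zone *zone)
{
	unsigned long flags;
	unsigned int order;

	spin_lock_irqsave(&zone->lock, flags);	/* buddy data is protected by zone->lock */
	for (order = 0; order < MAX_ORDER; order++)
		printk(KERN_INFO "%s: order %u: %lu free blocks\n",
		       zone->name, order, zone->free_area[order].nr_free);
	spin_unlock_irqrestore(&zone->lock, flags);
}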
iii. Node descriptor: struct pglist_data
NUMA stands for Non-Uniform Memory Access: on a NUMA platform, the time a given CPU needs to access memory depends on where that memory is.
Memory is therefore divided into nodes by access time; from a given CPU, all memory inside one node takes the same time to access, but that time may differ from one CPU to another.
Linux has supported NUMA since 2.6; each node is represented by a pg_data_t structure.
include/linux/mmzone.h
614 /*
615 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
616 * (mostly NUMA machines?) to denote a higher-level memory zone than the
617 * zone denotes.
618 *
619 * On NUMA machines, each NUMA node would have a pg_data_t to describe
620 * it's memory layout.
621 *
622 * Memory statistics and page replacement data structures are maintained on a
623 * per-zone basis.
624 */
625 struct bootmem_data;
626 typedef struct pglist_data {
627 struct zone node_zones[MAX_NR_ZONES];
628 struct zonelist node_zonelists[MAX_ZONELISTS];
629 int nr_zones;
630 #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
631 struct page *node_mem_map;
632 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
633 struct page_cgroup *node_page_cgroup;
634 #endif
635 #endif
636 struct bootmem_data *bdata;
637 #ifdef CONFIG_MEMORY_HOTPLUG
638 /*
639 * Must be held any time you expect node_start_pfn, node_present_pages
640 * or node_spanned_pages stay constant. Holding this will also
641 * guarantee that any pfn_valid() stays that way.
642 *
643 * Nests above zone->lock and zone->size_seqlock.
644 */
645 spinlock_t node_size_lock;
646 #endif
647 unsigned long node_start_pfn;
648 unsigned long node_present_pages; /* total number of physical pages */
649 unsigned long node_spanned_pages; /* total size of physical page
650 range, including holes */
651 int node_id;
652 wait_queue_head_t kswapd_wait;
653 struct task_struct *kswapd;
654 int kswapd_max_order;
655 } pg_data_t;
node_zones: the node's array of zone descriptors
node_zonelists: the order in which zones are tried by the zoned allocator, normally ZONE_HIGHMEM->ZONE_NORMAL->ZONE_DMA
nr_zones: number of zones in this node
node_mem_map: the node's array of page descriptors
node_start_pfn: the node's starting page frame number
node_present_pages: number of page frames in the node, excluding holes
node_spanned_pages: number of page frames spanned by the node, including holes
node_id: node ID
x86 uses the UMA model, which is treated as the special case of NUMA with a single node; this lets the kernel reuse the same code paths.
include/linux/mmzone.h
788 #ifndef CONFIG_NEED_MULTIPLE_NODES
789
790 extern struct pglist_data contig_page_data;
791 #define NODE_DATA(nid) (&contig_page_data)
mm/page_alloc.c
4472 #ifndef CONFIG_NEED_MULTIPLE_NODES
4473 struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
4474 EXPORT_SYMBOL(contig_page_data);
4475 #endif
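A brief sketch of how NODE_DATA() gives uniform access to a node's zones on both UMA and NUMA builds (an illustrative helper, not a function from the kernel):

#include <linux/mmzone.h>

/* Return the j-th zone of node 0, or NULL if the node has fewer zones. */
static struct zone *nth_zone_of_node0(int j)
{
	pg_data_t *pgdat = NODE_DATA(0);

	if (j < 0 || j >= pgdat->nr_zones)
		return NULL;
	return &pgdat->node_zones[j];
}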
II. Page frame management initialization
i. Node initialization
pg_data_t initialization:
start_kernel->setup_arch->paging_init->zone_sizes_init->free_area_init_nodes->free_area_init_node
3922 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3923 unsigned long node_start_pfn, unsigned long *zholes_size)
3924 {
3925 pg_data_t *pgdat = NODE_DATA(nid);
3926
3927 pgdat->node_id = nid;
3928 pgdat->node_start_pfn = node_start_pfn;
3929 calculate_node_totalpages(pgdat, zones_size, zholes_size);
3930
3931 alloc_node_mem_map(pgdat);
3932 #ifdef CONFIG_FLAT_NODE_MEM_MAP
3933 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
3934 nid, (unsigned long)pgdat,
3935 (unsigned long)pgdat->node_mem_map);
3936 #endif
3937
3938 free_area_init_core(pgdat, zones_size, zholes_size);
3939 }
3783 /*
3784 * Set up the zone data structures:
3785 * - mark all pages reserved
3786 * - mark all memory queues empty
3787 * - clear the memory bitmaps
3788 */
3789 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3790 unsigned long *zones_size, unsigned long *zholes_size)
3791 {
3792 enum zone_type j;
3793 int nid = pgdat->node_id;
3794 unsigned long zone_start_pfn = pgdat->node_start_pfn;
3795 int ret;
3796
3797 pgdat_resize_init(pgdat);
3798 pgdat->nr_zones = 0;
3799 init_waitqueue_head(&pgdat->kswapd_wait);
3800 pgdat->kswapd_max_order = 0;
3801 pgdat_page_cgroup_init(pgdat);
3802
3803 for (j = 0; j < MAX_NR_ZONES; j++) {
3804 struct zone *zone = pgdat->node_zones + j;
3805 unsigned long size, realsize, memmap_pages;
3806 enum lru_list l;
3807
3808 size = zone_spanned_pages_in_node(nid, j, zones_size);
3809 realsize = size - zone_absent_pages_in_node(nid, j,
3810 zholes_size);
3811
3812 /*
3813 * Adjust realsize so that it accounts for how much memory
3814 * is used by this zone for memmap. This affects the watermark
3815 * and per-cpu initialisations
3816 */
3817 memmap_pages =
3818 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3819 if (realsize >= memmap_pages) {
3820 realsize -= memmap_pages;
3821 if (memmap_pages)
3822 printk(KERN_DEBUG
3823 " %s zone: %lu pages used for memmap\n",
3824 zone_names[j], memmap_pages);
3825 } else
3826 printk(KERN_WARNING
3827 " %s zone: %lu pages exceeds realsize %lu\n",
3828 zone_names[j], memmap_pages, realsize);
3829
3830 /* Account for reserved pages */
3831 if (j == 0 && realsize > dma_reserve) {
3832 realsize -= dma_reserve;
3833 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
3834 zone_names[0], dma_reserve);
3835 }
3836
3837 if (!is_highmem_idx(j))
3838 nr_kernel_pages += realsize;
3839 nr_all_pages += realsize;
3840
3841 zone->spanned_pages = size;
3842 zone->present_pages = realsize;
3843 #ifdef CONFIG_NUMA
3844 zone->node = nid;
3845 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
3846 / 100;
3847 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
3848 #endif
3849 zone->name = zone_names[j];
3850 spin_lock_init(&zone->lock);
3851 spin_lock_init(&zone->lru_lock);
3852 zone_seqlock_init(zone);
3853 zone->zone_pgdat = pgdat;
3854
3855 zone->prev_priority = DEF_PRIORITY;
3856
3857 zone_pcp_init(zone);
3858 for_each_lru(l) {
3859 INIT_LIST_HEAD(&zone->lru[l].list);
3860 zone->reclaim_stat.nr_saved_scan[l] = 0;
3861 }
3862 zone->reclaim_stat.recent_rotated[0] = 0;
3863 zone->reclaim_stat.recent_rotated[1] = 0;
3864 zone->reclaim_stat.recent_scanned[0] = 0;
3865 zone->reclaim_stat.recent_scanned[1] = 0;
3866 zap_zone_vm_stats(zone);
3867 zone->flags = 0;
3868 if (!size)
3869 continue;
3870
3871 set_pageblock_order(pageblock_default_order());
3872 setup_usemap(pgdat, zone, size);
3873 ret = init_currently_empty_zone(zone, zone_start_pfn,
3874 size, MEMMAP_EARLY);
3875 BUG_ON(ret);
3876 memmap_init(size, nid, j, zone_start_pfn);
3877 zone_start_pfn += size;
3878 }
3879 }
ii. Page descriptor array initialization
mem_map points to pg_data_t->node_mem_map
3015 #ifndef __HAVE_ARCH_MEMMAP_INIT
3016 #define memmap_init(size, nid, zone, start_pfn) \
3017 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3018 #endif
2943 /*
2944 * Initially all pages are reserved - free ones are freed
2945 * up by free_all_bootmem() once the early boot process is
2946 * done. Non-atomic initialization, single-pass.
2947 */
2948 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2949 unsigned long start_pfn, enum memmap_context context)
2950 {
2951 struct page *page;
2952 unsigned long end_pfn = start_pfn + size;
2953 unsigned long pfn;
2954 struct zone *z;
2955
2956 if (highest_memmap_pfn < end_pfn - 1)
2957 highest_memmap_pfn = end_pfn - 1;
2958
2959 z = &NODE_DATA(nid)->node_zones[zone];
2960 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2961 /*
2962 * There can be holes in boot-time mem_map[]s
2963 * handed to this function. They do not
2964 * exist on hotplugged memory.
2965 */
2966 if (context == MEMMAP_EARLY) {
2967 if (!early_pfn_valid(pfn))
2968 continue;
2969 if (!early_pfn_in_nid(pfn, nid))
2970 continue;
2971 }
2972 page = pfn_to_page(pfn);
2973 set_page_links(page, zone, nid, pfn);
2974 mminit_verify_page_links(page, zone, nid, pfn);
2975 init_page_count(page);
2976 reset_page_mapcount(page);
2977 SetPageReserved(page);
2978 /*
2979 * Mark the block movable so that blocks are reserved for
2980 * movable at startup. This will force kernel allocations
2981 * to reserve their blocks rather than leaking throughout
2982 * the address space during boot when many long-lived
2983 * kernel allocations are made. Later some blocks near
2984 * the start are marked MIGRATE_RESERVE by
2985 * setup_zone_migrate_reserve()
2986 *
2987 * bitmap is created for zone's valid pfn range. but memmap
2988 * can be created for invalid pages (for alignment)
2989 * check here not to call set_pageblock_migratetype() against
2990 * pfn out of zone.
2991 */
2992 if ((z->zone_start_pfn <= pfn)
2993 && (pfn < z->zone_start_pfn + z->spanned_pages)
2994 && !(pfn & (pageblock_nr_pages - 1)))
2995 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2996
2997 INIT_LIST_HEAD(&page->lru);
2998 #ifdef WANT_PAGE_VIRTUAL
2999 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3000 if (!is_highmem_idx(zone))
3001 set_page_address(page, __va(pfn << PAGE_SHIFT));
3002 #endif
3003 }
3004 }
iii. Zone descriptor initialization
3305 __meminit int init_currently_empty_zone(struct zone *zone,
3306 unsigned long zone_start_pfn,
3307 unsigned long size,
3308 enum memmap_context context)
3309 {
3310 struct pglist_data *pgdat = zone->zone_pgdat;
3311 int ret;
3312 ret = zone_wait_table_init(zone, size);
3313 if (ret)
3314 return ret;
3315 pgdat->nr_zones = zone_idx(zone) + 1;
3316
3317 zone->zone_start_pfn = zone_start_pfn;
3318
3319 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3320 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
3321 pgdat->node_id,
3322 (unsigned long)zone_idx(zone),
3323 zone_start_pfn, (zone_start_pfn + size));
3324
3325 zone_init_free_lists(zone);
3326
3327 return 0;
3328 }
3006 static void __meminit zone_init_free_lists(struct zone *zone)
3007 {
3008 int order, t;
3009 for_each_migratetype_order(order, t) {
3010 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3011 zone->free_area[order].nr_free = 0;
3012 }
3013 }
III. The buddy system
External fragmentation: frequently allocating and releasing page frame blocks of different sizes easily leaves many small free blocks scattered in the gaps between allocated blocks, so that a sufficiently large block can no longer be allocated even though the total number of free frames is large. For example, if allocated blocks occur at intervals of less than 1024 frames, a run of 1024 contiguous frames can never be obtained again.
The buddy system is mainly aimed at external fragmentation; the block handed out may be larger than requested, but never more than twice as large.
By default the Linux buddy system keeps 11 orders (0, 1, ..., 10) of free lists, each chaining together the free blocks of that order via their first page descriptor. A block of order k contains 1<<k page frames and its starting frame number is a multiple of 1<<k; the lowest-order block is a single frame, the highest is 1<<10 frames.
A higher-order block consists of a pair of lower-order buddy blocks: one higher-order block can be split into two lower-order buddies, and two lower-order buddies can be merged back into one higher-order block.
During allocation, an oversized block is split into buddies, and each split-off lower-order buddy is put on the free list of its order, until no further split is needed.
During freeing, lower-order buddies are merged upward until the highest order is reached or no further merge is possible.
i. Allocating a block
Allocation searches the free lists from the smallest sufficient order upward until a free block is found or it is clear that none exists.
If the block found is exactly the requested size, the page descriptor of its first frame is returned directly.
Otherwise the surplus is repeatedly halved and each half is added to the free list of the corresponding order.
For example, suppose a run of 8 contiguous frames [12345678] is found for a request of 2 frames: [5678] is first added to the order=2 free list, then [34] to the order=1 free list, and the page descriptor of [1] (the head of [12]) is returned.
__rmqueue looks up a free block inside a zone; on success it returns the page descriptor of the block's first frame, on failure NULL. zone->lock must be held before calling __rmqueue to protect the buddy system data.
643 /*
644 * The order of subdivision here is critical for the IO subsystem.
645 * Please do not alter this order without good reasons and regression
646 * testing. Specifically, as large blocks of memory are subdivided,
647 * the order in which smaller blocks are delivered depends on the order
648 * they're subdivided in this function. This is the primary factor
649 * influencing the order in which pages are delivered to the IO
650 * subsystem according to empirical testing, and this is also justified
651 * by considering the behavior of a buddy system containing a single
652 * large block of memory acted on by a series of small allocations.
653 * This behavior is a critical factor in sglist merging's success.
654 *
655 * -- wli
656 */
657 static inline void expand(struct zone *zone, struct page *page,
658 int low, int high, struct free_area *area,
659 int migratetype)
660 {
661 unsigned long size = 1 << high;
662
663 while (high > low) {
664 area--;
665 high--;
666 size >>= 1;
667 VM_BUG_ON(bad_range(zone, &page[size]));
668 list_add(&page[size].lru, &area->free_list[migratetype]);
669 area->nr_free++;
670 set_page_order(&page[size], high);
671 }
672 }
714 /*
715 * Go through the free lists for the given migratetype and remove
716 * the smallest available page from the freelists
717 */
718 static inline
719 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
720 int migratetype)
721 {
722 unsigned int current_order;
723 struct free_area * area;
724 struct page *page;
725
726 /* Find a page of the appropriate size in the preferred list */
727 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
728 area = &(zone->free_area[current_order]);
729 if (list_empty(&area->free_list[migratetype]))
730 continue;
731
732 page = list_entry(area->free_list[migratetype].next,
733 struct page, lru);
734 list_del(&page->lru);
735 rmv_page_order(page);
736 area->nr_free--;
737 expand(zone, page, order, current_order, area, migratetype);
738 return page;
739 }
740
741 return NULL;
742 }
907 /*
908 * Do the hard work of removing an element from the buddy allocator.
909 * Call me with the zone->lock already held.
910 */
911 static struct page *__rmqueue(struct zone *zone, unsigned int order,
912 int migratetype)
913 {
914 struct page *page;
915
916 retry_reserve:
917 page = __rmqueue_smallest(zone, order, migratetype);
918
919 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
920 page = __rmqueue_fallback(zone, order, migratetype);
921
922 /*
923 * Use MIGRATE_RESERVE rather than fail an allocation. goto
924 * is used because __rmqueue_smallest is an inline function
925 * and we want just one call site
926 */
927 if (!page) {
928 migratetype = MIGRATE_RESERVE;
929 goto retry_reserve;
930 }
931 }
932
933 trace_mm_page_alloc_zone_locked(page, order, migratetype);
934 return page;
935 }
1. Scan from the smallest order upward for the first free block that is large enough; remove it from its free list and decrement that list's nr_free count.
2. If the block found is larger than requested (a power-of-two multiple of the request), repeatedly halve it and put the upper half on the free list of the corresponding order.
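The following standalone demo (a user-space sketch, not kernel code) mirrors the split loop of expand() for the 8-frame example above, using 0-based frame numbers: an order-3 block satisfies an order-1 request, and each unused upper half drops onto a lower free list.

#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 0;	/* first frame of the order-3 block */
	int low = 1, high = 3;		/* requested order / order found    */
	unsigned long size = 1UL << high;

	while (high > low) {		/* same control flow as expand()    */
		high--;
		size >>= 1;
		printf("put frames [%lu..%lu] on the order-%d free list\n",
		       page_idx + size, page_idx + 2 * size - 1, high);
	}
	printf("return frames [%lu..%lu] (order %d) to the caller\n",
	       page_idx, page_idx + (1UL << low) - 1, low);
	return 0;
}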
ii. Freeing a block
Freeing a block mainly performs buddy merging: each free buddy is removed from its free list and merged, and the resulting block is inserted into the free list of its new order.
1. Buddy check
include/linux/page-flags.h:
139 #define TESTPAGEFLAG(uname, lname) \
140 static inline int Page##uname(struct page *page) \
141 { return test_bit(PG_##lname, &page->flags); }
235 __PAGEFLAG(Buddy, buddy)
mm/page_alloc.c:
396 /*
397 * This function checks whether a page is free && is the buddy
398 * we can do coalesce a page and its buddy if
399 * (a) the buddy is not in a hole &&
400 * (b) the buddy is in the buddy system &&
401 * (c) a page and its buddy have the same order &&
402 * (d) a page and its buddy are in the same zone.
403 *
404 * For recording whether a page is in the buddy system, we use PG_buddy.
405 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
406 *
407 * For recording page's order, we use page_private(page).
408 */
409 static inline int page_is_buddy(struct page *page, struct page *buddy,
410 int order)
411 {
412 if (!pfn_valid_within(page_to_pfn(buddy)))
413 return 0;
414
415 if (page_zone_id(page) != page_zone_id(buddy))
416 return 0;
417
418 if (PageBuddy(buddy) && page_order(buddy) == order) {
419 VM_BUG_ON(page_count(buddy) != 0);
420 return 1;
421 }
422 return 0;
423 }
Conditions for a block's buddy to be merged with it:
a. The buddy's first frame corresponds to a real physical frame (not a hole)
b. Both blocks are in the same zone
c. Both blocks have the same order
d. The buddy is in the buddy system (PG_buddy set)
2. Buddy lookup
365 /*
366 * Locate the struct page for both the matching buddy in our
367 * pair (buddy1) and the combined O(n+1) page they form (page).
368 *
369 * 1) Any buddy B1 will have an order O twin B2 which satisfies
370 * the following equation:
371 * B2 = B1 ^ (1 << O)
372 * For example, if the starting buddy (buddy2) is #8 its order
373 * 1 buddy is #10:
374 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
375 *
376 * 2) Any buddy B will have an order O+1 parent P which
377 * satisfies the following equation:
378 * P = B & ~(1 << O)
379 *
380 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
381 */
382 static inline struct page *
383 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
384 {
385 unsigned long buddy_idx = page_idx ^ (1 << order);
386
387 return page + (buddy_idx - page_idx);
388 }
389
390 static inline unsigned long
391 __find_combined_index(unsigned long page_idx, unsigned int order)
392 {
393 return (page_idx & ~(1 << order));
394 }
__page_find_buddy returns the page descriptor of the first frame of the buddy block. A block and its buddy differ only in bit (1 << order) of their index: where one has a 1 the other has a 0, so flipping that bit of page_idx yields the buddy's index.
__find_combined_index returns the index of the first frame of the merged block: clearing bit (1 << order) of page_idx gives the merged block's starting index.
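A small hypothetical user-space demo of the same arithmetic (the values are chosen purely for illustration): for page_idx 12 at order 2, the buddy starts at 12 ^ 4 = 8 and the merged order-3 block starts at 12 & ~4 = 8.

#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 12;
	unsigned int order = 2;
	unsigned long buddy_idx    = page_idx ^ (1UL << order);
	unsigned long combined_idx = page_idx & ~(1UL << order);

	printf("buddy of %lu at order %u starts at %lu\n", page_idx, order, buddy_idx);
	printf("merged order-%u block starts at %lu\n", order + 1, combined_idx);
	return 0;
}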
3. Buddy merging
425 /*
426 * Freeing function for a buddy system allocator.
427 *
428 * The concept of a buddy system is to maintain direct-mapped table
429 * (containing bit values) for memory blocks of various "orders".
430 * The bottom level table contains the map for the smallest allocatable
431 * units of memory (here, pages), and each level above it describes
432 * pairs of units from the levels below, hence, "buddies".
433 * At a high level, all that happens here is marking the table entry
434 * at the bottom level available, and propagating the changes upward
435 * as necessary, plus some accounting needed to play nicely with other
436 * parts of the VM system.
437 * At each level, we keep a list of pages, which are heads of continuous
438 * free pages of length of (1 << order) and marked with PG_buddy. Page's
439 * order is recorded in page_private(page) field.
440 * So when we are allocating or freeing one, we can derive the state of the
441 * other. That is, if we allocate a small block, and both were
442 * free, the remainder of the region must be split into blocks.
443 * If a block is freed, and its buddy is also free, then this
444 * triggers coalescing into a block of larger size.
445 *
446 * -- wli
447 */
448
449 static inline void __free_one_page(struct page *page,
450 struct zone *zone, unsigned int order,
451 int migratetype)
452 {
453 unsigned long page_idx;
454
455 if (unlikely(PageCompound(page)))
456 if (unlikely(destroy_compound_page(page, order)))
457 return;
458
459 VM_BUG_ON(migratetype == -1);
460
461 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
462
463 VM_BUG_ON(page_idx & ((1 << order) - 1));
464 VM_BUG_ON(bad_range(zone, page));
465
466 while (order < MAX_ORDER-1) {
467 unsigned long combined_idx;
468 struct page *buddy;
469
470 buddy = __page_find_buddy(page, page_idx, order);
471 if (!page_is_buddy(page, buddy, order))
472 break;
473
474 /* Our buddy is free, merge with it and move up one order. */
475 list_del(&buddy->lru);
476 zone->free_area[order].nr_free--;
477 rmv_page_order(buddy);
478 combined_idx = __find_combined_index(page_idx, order);
479 page = page + (combined_idx - page_idx);
480 page_idx = combined_idx;
481 order++;
482 }
483 set_page_order(page, order);
484 list_add(&page->lru,
485 &zone->free_area[order].free_list[migratetype]);
486 zone->free_area[order].nr_free++;
487 }
The loop keeps looking for the buddy until none qualifies or the maximum order is reached; each buddy found is removed from its free list and merged, and the final merged block is inserted into the free list of the resulting order.
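A hypothetical user-space walk-through of this merge loop (illustration only): freeing frame 4 at order 0 while frame 5, block 6-7 and block 0-3 are already free coalesces step by step into a single order-3 block starting at frame 0.

#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 4;	/* frame being freed */
	unsigned int order = 0;

	/* assume page_is_buddy() succeeds at orders 0, 1 and 2 */
	for (; order < 3; order++) {
		unsigned long buddy_idx = page_idx ^ (1UL << order);
		unsigned long combined  = page_idx & ~(1UL << order);

		printf("order %u: merge block %lu with buddy %lu -> block %lu\n",
		       order, page_idx, buddy_idx, combined);
		page_idx = combined;
	}
	printf("insert frames [%lu..%lu] on the order-3 free list\n",
	       page_idx, page_idx + 7);
	return 0;
}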
iii. Releasing bootmem memory
start_kernel->mm_init->mem_init->free_all_bootmem->free_all_bootmem_core->__free_pages_bootmem->__free_pages->__free_pages_ok->free_one_page
614 /*
615 * permit the bootmem allocator to evade page validation on high-order frees
616 */
617 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
618 {
619 if (order == 0) {
620 __ClearPageReserved(page);
621 set_page_count(page, 0);
622 set_page_refcounted(page);
623 __free_page(page);
624 } else {
625 int loop;
626
627 prefetchw(page);
628 for (loop = 0; loop < BITS_PER_LONG; loop++) {
629 struct page *p = &page[loop];
630
631 if (loop + 1 < BITS_PER_LONG)
632 prefetchw(p + 1);
633 __ClearPageReserved(p);
634 set_page_count(p, 0);
635 }
636
637 set_page_refcounted(page);
638 __free_pages(page, order);
639 }
640 }
583 static void __free_pages_ok(struct page *page, unsigned int order)
584 {
585 unsigned long flags;
586 int i;
587 int bad = 0;
588 int wasMlocked = __TestClearPageMlocked(page);
589
590 kmemcheck_free_shadow(page, order);
591
592 for (i = 0 ; i < (1 << order) ; ++i)
593 bad += free_pages_check(page + i);
594 if (bad)
595 return;
596
597 if (!PageHighMem(page)) {
598 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
599 debug_check_no_obj_freed(page_address(page),
600 PAGE_SIZE << order);
601 }
602 arch_free_page(page, order);
603 kernel_map_pages(page, 1 << order, 0);
604
605 local_irq_save(flags);
606 if (unlikely(wasMlocked))
607 free_page_mlock(page);
608 __count_vm_events(PGFREE, 1 << order);
609 free_one_page(page_zone(page), page, order,
610 get_pageblock_migratetype(page));
611 local_irq_restore(flags);
612 }
During node descriptor initialization all page frames are marked reserved; when bootmem releases its free memory, the page descriptors are marked free and the frames are handed to the buddy system or to the per-CPU cache.
IV. Per-CPU page frame cache
The kernel allocates and frees single page frames very frequently, so each zone keeps a per-CPU page frame cache to improve performance.
The cache holds page frames obtained in advance from the buddy system (or from bootmem) for the local CPU to allocate from.
i. Data structures
include/linux/mmzone.h:
169 struct per_cpu_pages {
170 int count; /* number of pages in the list */
171 int high; /* high watermark, emptying needed */
172 int batch; /* chunk size for buddy add/remove */
173
174 /* Lists of pages, one per migrate type stored on the pcp-lists */
175 struct list_head lists[MIGRATE_PCPTYPES];
176 };
177
178 struct per_cpu_pageset {
179 struct per_cpu_pages pcp;
180 #ifdef CONFIG_NUMA
181 s8 expire;
182 #endif
183 #ifdef CONFIG_SMP
184 s8 stat_threshold;
185 s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
186 #endif
187 } ____cacheline_aligned_in_smp;
count: number of page frames currently available in the cache
high: upper limit on the number of cached free frames
batch: number of frames fetched from the buddy system in one go when the cache is empty, and the number returned to the buddy system when the cache exceeds high
lists: page lists, one per migrate type
ii. Allocating from the per-CPU cache
1175 /*
1176 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1177 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1178 * or two.
1179 */
1180 static inline
1181 struct page *buffered_rmqueue(struct zone *preferred_zone,
1182 struct zone *zone, int order, gfp_t gfp_flags,
1183 int migratetype)
1184 {
1185 unsigned long flags;
1186 struct page *page;
1187 int cold = !!(gfp_flags & __GFP_COLD);
1188 int cpu;
1189
1190 again:
1191 cpu = get_cpu();
1192 if (likely(order == 0)) {
1193 struct per_cpu_pages *pcp;
1194 struct list_head *list;
1195
1196 pcp = &zone_pcp(zone, cpu)->pcp;
1197 list = &pcp->lists[migratetype];
1198 local_irq_save(flags);
1199 if (list_empty(list)) {
1200 pcp->count += rmqueue_bulk(zone, 0,
1201 pcp->batch, list,
1202 migratetype, cold);
1203 if (unlikely(list_empty(list)))
1204 goto failed;
1205 }
1206
1207 if (cold)
1208 page = list_entry(list->prev, struct page, lru);
1209 else
1210 page = list_entry(list->next, struct page, lru);
1211
1212 list_del(&page->lru);
1213 pcp->count--;
1214 } else {
1215 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1216 /*
1217 * __GFP_NOFAIL is not to be used in new code.
1218 *
1219 * All __GFP_NOFAIL callers should be fixed so that they
1220 * properly detect and handle allocation failures.
1221 *
1222 * We most definitely don't want callers attempting to
1223 * allocate greater than order-1 page units with
1224 * __GFP_NOFAIL.
1225 */
1226 WARN_ON_ONCE(order > 1);
1227 }
1228 spin_lock_irqsave(&zone->lock, flags);
1229 page = __rmqueue(zone, order, migratetype);
1230 spin_unlock(&zone->lock);
1231 if (!page)
1232 goto failed;
1233 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1234 }
1235
1236 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1237 zone_statistics(preferred_zone, zone);
1238 local_irq_restore(flags);
1239 put_cpu();
1240
1241 VM_BUG_ON(bad_range(zone, page));
1242 if (prep_new_page(page, order, gfp_flags))
1243 goto again;
1244 return page;
1245
1246 failed:
1247 local_irq_restore(flags);
1248 put_cpu();
1249 return NULL;
1250 }
When a single page frame is requested (order=0), a free page is taken from the per-CPU cache;
if the cache is empty, pcp->batch frames are first obtained from the buddy system and added to it;
the allocated page is then removed from the cache and returned.
iii. Freeing into the per-CPU cache
1078 /*
1079 * Free a 0-order page
1080 */
1081 static void free_hot_cold_page(struct page *page, int cold)
1082 {
1083 struct zone *zone = page_zone(page);
1084 struct per_cpu_pages *pcp;
1085 unsigned long flags;
1086 int migratetype;
1087 int wasMlocked = __TestClearPageMlocked(page);
1088
1089 kmemcheck_free_shadow(page, 0);
1090
1091 if (PageAnon(page))
1092 page->mapping = NULL;
1093 if (free_pages_check(page))
1094 return;
1095
1096 if (!PageHighMem(page)) {
1097 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1098 debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
1099 }
1100 arch_free_page(page, 0);
1101 kernel_map_pages(page, 1, 0);
1102
1103 pcp = &zone_pcp(zone, get_cpu())->pcp;
1104 migratetype = get_pageblock_migratetype(page);
1105 set_page_private(page, migratetype);
1106 local_irq_save(flags);
1107 if (unlikely(wasMlocked))
1108 free_page_mlock(page);
1109 __count_vm_event(PGFREE);
1110
1111 /*
1112 * We only track unmovable, reclaimable and movable on pcp lists.
1113 * Free ISOLATE pages back to the allocator because they are being
1114 * offlined but treat RESERVE as movable pages so we can get those
1115 * areas back if necessary. Otherwise, we may have to free
1116 * excessively into the page allocator
1117 */
1118 if (migratetype >= MIGRATE_PCPTYPES) {
1119 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1120 free_one_page(zone, page, 0, migratetype);
1121 goto out;
1122 }
1123 migratetype = MIGRATE_MOVABLE;
1124 }
1125
1126 if (cold)
1127 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1128 else
1129 list_add(&page->lru, &pcp->lists[migratetype]);
1130 pcp->count++;
1131 if (pcp->count >= pcp->high) {
1132 free_pcppages_bulk(zone, pcp->batch, pcp);
1133 pcp->count -= pcp->batch;
1134 }
1135
1136 out:
1137 local_irq_restore(flags);
1138 put_cpu();
1139 }
2019 void __free_pages(struct page *page, unsigned int order)
2020 {
2021 if (put_page_testzero(page)) {
2022 trace_mm_page_free_direct(page, order);
2023 if (order == 0)
2024 free_hot_page(page);
2025 else
2026 __free_pages_ok(page, order);
2027 }
2028 }
When a single page frame is freed (order=0), it is placed in the per-CPU cache;
if the cached count then reaches pcp->high, pcp->batch frames are returned to the buddy system.
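As a worked example with hypothetical values pcp->high = 90 and pcp->batch = 15: the moment count reaches 90, free_pcppages_bulk() hands 15 pages back to the buddy free lists and count drops to 75; later order-0 frees then refill the cache.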
V. Zone allocator
The zone allocator is the entry point of the kernel's page frame allocator: it first uses the zonelist to decide which zone to allocate from, and then takes page frames from that zone's per-CPU cache or buddy system.
i. Zone memory allocation
1. Zone memory allocation
1935 /*
1936 * This is the 'heart' of the zoned buddy allocator.
1937 */
1938 struct page *
1939 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1940 struct zonelist *zonelist, nodemask_t *nodemask)
1941 {
1942 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1943 struct zone *preferred_zone;
1944 struct page *page;
1945 int migratetype = allocflags_to_migratetype(gfp_mask);
1946
1947 gfp_mask &= gfp_allowed_mask;
1948
1949 lockdep_trace_alloc(gfp_mask);
1950
1951 might_sleep_if(gfp_mask & __GFP_WAIT);
1952
1953 if (should_fail_alloc_page(gfp_mask, order))
1954 return NULL;
1955
1956 /*
1957 * Check the zones suitable for the gfp_mask contain at least one
1958 * valid zone. It's possible to have an empty zonelist as a result
1959 * of GFP_THISNODE and a memoryless node
1960 */
1961 if (unlikely(!zonelist->_zonerefs->zone))
1962 return NULL;
1963
1964 /* The preferred zone is used for statistics later */
1965 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1966 if (!preferred_zone)
1967 return NULL;
1968
1969 /* First allocation attempt */
1970 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1971 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1972 preferred_zone, migratetype);
1973 if (unlikely(!page))
1974 page = __alloc_pages_slowpath(gfp_mask, order,
1975 zonelist, high_zoneidx, nodemask,
1976 preferred_zone, migratetype);
1977
1978 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1979 return page;
1980 }
The zonelist and gfp_mask first determine which zone allocation starts from;
a page frame block is then allocated from that zone's per-CPU cache or buddy system.
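A hedged usage sketch of a typical in-kernel caller (assuming the 2.6.32-era API; alloc_pages() eventually reaches __alloc_pages_nodemask() above, and the helper names grab_pages()/drop_pages() are made up for this example):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

static struct page *pages;

static int grab_pages(void)
{
	pages = alloc_pages(GFP_KERNEL, 2);	/* order 2: 4 contiguous frames */
	if (!pages)
		return -ENOMEM;
	/* GFP_KERNEL pages come from lowmem, so they have a kernel virtual address */
	memset(page_address(pages), 0, 4 * PAGE_SIZE);
	return 0;
}

static void drop_pages(void)
{
	__free_pages(pages, 2);		/* order > 0, so this takes the __free_pages_ok() path */
}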
2. Finding the zone from the zonelist and gfp_mask
include/linux/gfp.h:
149 /*
150 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
151 * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long
152 * and there are 16 of them to cover all possible combinations of
153 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM
154 *
155 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
156 * But GFP_MOVABLE is not only a zone specifier but also an allocation
157 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
158 * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1".
159 *
160 * bit result
161 * =================
162 * 0x0 => NORMAL
163 * 0x1 => DMA or NORMAL
164 * 0x2 => HIGHMEM or NORMAL
165 * 0x3 => BAD (DMA+HIGHMEM)
166 * 0x4 => DMA32 or DMA or NORMAL
167 * 0x5 => BAD (DMA+DMA32)
168 * 0x6 => BAD (HIGHMEM+DMA32)
169 * 0x7 => BAD (HIGHMEM+DMA32+DMA)
170 * 0x8 => NORMAL (MOVABLE+0)
171 * 0x9 => DMA or NORMAL (MOVABLE+DMA)
172 * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too)
173 * 0xb => BAD (MOVABLE+HIGHMEM+DMA)
174 * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32)
175 * 0xd => BAD (MOVABLE+DMA32+DMA)
176 * 0xe => BAD (MOVABLE+DMA32+HIGHMEM)
177 * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
178 *
179 * ZONES_SHIFT must be <= 2 on 32 bit platforms.
180 */
181
182 #if 16 * ZONES_SHIFT > BITS_PER_LONG
183 #error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
184 #endif
185
186 #define GFP_ZONE_TABLE ( \
187 (ZONE_NORMAL << 0 * ZONES_SHIFT) \
188 | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT) \
189 | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT) \
190 | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT) \
191 | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT) \
192 | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT) \
193 | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\
194 | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\
195 )
196
197 /*
198 * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32
199 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
200 * entry starting with bit 0. Bit is set if the combination is not
201 * allowed.
202 */
203 #define GFP_ZONE_BAD ( \
204 1 << (__GFP_DMA | __GFP_HIGHMEM) \
205 | 1 << (__GFP_DMA | __GFP_DMA32) \
206 | 1 << (__GFP_DMA32 | __GFP_HIGHMEM) \
207 | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM) \
208 | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA) \
209 | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA) \
210 | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM) \
211 | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\
212 )
213
214 static inline enum zone_type gfp_zone(gfp_t flags)
215 {
216 enum zone_type z;
217 int bit = flags & GFP_ZONEMASK;
218
219 z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
220 ((1 << ZONES_SHIFT) - 1);
221
222 if (__builtin_constant_p(bit))
223 MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
224 else {
225 #ifdef CONFIG_DEBUG_VM
226 BUG_ON((GFP_ZONE_BAD >> bit) & 1);
227 #endif
228 }
229 return z;
230 }
include/linux/mmzone.h
870 /**
871 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
872 * @zonelist - The zonelist to search for a suitable zone
873 * @highest_zoneidx - The zone index of the highest zone to return
874 * @nodes - An optional nodemask to filter the zonelist with
875 * @zone - The first suitable zone found is returned via this parameter
876 *
877 * This function returns the first zone at or below a given zone index that is
878 * within the allowed nodemask. The zoneref returned is a cursor that can be
879 * used to iterate the zonelist with next_zones_zonelist by advancing it by
880 * one before calling.
881 */
882 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
883 enum zone_type highest_zoneidx,
884 nodemask_t *nodes,
885 struct zone **zone)
886 {
887 return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
888 zone);
889 }
mm/mmzone.c:
55 /* Returns the next zone at or below highest_zoneidx in a zonelist */
56 struct zoneref *next_zones_zonelist(struct zoneref *z,
57 enum zone_type highest_zoneidx,
58 nodemask_t *nodes,
59 struct zone **zone)
60 {
61 /*
62 * Find the next suitable zone to use for the allocation.
63 * Only filter based on nodemask if it's set
64 */
65 if (likely(nodes == NULL))
66 while (zonelist_zone_idx(z) > highest_zoneidx)
67 z++;
68 else
69 while (zonelist_zone_idx(z) > highest_zoneidx ||
70 (z->zone && !zref_in_nodemask(z, nodes)))
71 z++;
72
73 *zone = zonelist_zone(z);
74 return z;
75 }
__GFP_DMA, __GFP_HIGHMEM, __GFP_DMA32 and __GFP_MOVABLE occupy bits 1 to 4 of the low 4 bits of gfp_mask, so no two combinations of these flags produce the same value; there are at most 16 combinations, and each combination's entry in the table is ZONES_SHIFT bits wide.
GFP_ZONE_TABLE: maps each combination of zone-modifier flags in gfp_mask to the zone to use
GFP_ZONE_BAD: marks the flag combinations that are not permitted
zonelist: the order in which zones are tried when allocating, normally ZONE_HIGHMEM->ZONE_NORMAL->ZONE_DMA; a later zone is used only after the earlier ones fail, e.g. ZONE_DMA is used only when both ZONE_HIGHMEM and ZONE_NORMAL have no free memory
gfp_zone: takes the zone-modifier flags in gfp_mask (__GFP_DMA, __GFP_HIGHMEM, __GFP_DMA32, __GFP_MOVABLE) and looks up the corresponding zone index in GFP_ZONE_TABLE
first_zones_zonelist: finds the first zone in the zonelist whose index is less than or equal to high_zoneidx
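A worked example, assuming a 32-bit x86 build with CONFIG_HIGHMEM so that ZONES_SHIFT == 2: GFP_KERNEL sets none of the four zone modifiers, so bit == 0x0 and gfp_zone() reads the lowest ZONES_SHIFT bits of GFP_ZONE_TABLE, which hold ZONE_NORMAL; GFP_HIGHUSER includes __GFP_HIGHMEM, so bit == 0x2 and (GFP_ZONE_TABLE >> (0x2 * 2)) & 0x3 yields OPT_ZONE_HIGHMEM, i.e. ZONE_HIGHMEM.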
ii. Zone memory release
2019 void __free_pages(struct page *page, unsigned int order)
2020 {
2021 if (put_page_testzero(page)) {
2022 trace_mm_page_free_direct(page, order);
2023 if (order == 0)
2024 free_hot_page(page);
2025 else
2026 __free_pages_ok(page, order);
2027 }
2028 }
Frames are released into the per-CPU cache (order 0) or back to the buddy system (order > 0).
To sum up: this article has walked through page frame management in Linux, focusing on how the buddy system tackles external fragmentation. Page frames are described by page descriptors, grouped into zones by zone descriptors, and into nodes by node descriptors for NUMA support. The buddy system keeps 11 orders of free lists and splits and merges blocks to allocate and reclaim memory efficiently, while the per-CPU page frame cache speeds up allocation and release of single frames.