Linux Memory Management: Page Frame Management

This article takes a close look at page frame management in the Linux kernel, focusing on how the buddy system effectively limits external fragmentation. Page frames are represented by page descriptors, zone descriptors group frames by zone, and node descriptors provide NUMA support. The buddy system maintains 11 levels of free lists and splits and merges blocks of page frames so that memory can be allocated and reclaimed efficiently. A per-CPU page frame cache is also covered, which optimizes allocation and freeing of single page frames.

Linux page frame management uses the buddy system algorithm. Compared with bootmem it allocates much faster, quickly locating free memory, and it effectively limits external fragmentation.
The page frame handling code lives mainly in mm/page_alloc.c.

I. Main Data Structures
i. Page descriptor: struct page
The core of memory management is page frame management. The kernel must know the exact state of every page frame: whether it is free, whether it holds code or data, whether it has been modified, and so on.
Every page frame has a page descriptor associated with it that records this information; the descriptor is a struct page.

 33 /*
 34  * Each physical page in the system has a struct page associated with
 35  * it to keep track of whatever it is we are using the page for at the
 36  * moment. Note that we have no way to track which tasks are using
 37  * a page, though if it is a pagecache page, rmap structures can tell us
 38  * who is mapping it.
 39  */
 40 struct page {
 41         unsigned long flags;            /* Atomic flags, some possibly
 42                                          * updated asynchronously */
 43         atomic_t _count;                /* Usage count, see below. */
 44         union {
 45                 atomic_t _mapcount;     /* Count of ptes mapped in mms,
 46                                          * to show when page is mapped
 47                                          * & limit reverse map searches.
 48                                          */
 49                 struct {                /* SLUB */
 50                         u16 inuse;
 51                         u16 objects;
 52                 };
 53         };
 54         union {
 55             struct {
 56                 unsigned long private;          /* Mapping-private opaque data:
 57                                                  * usually used for buffer_heads
 58                                                  * if PagePrivate set; used for
 59                                                  * swp_entry_t if PageSwapCache;
 60                                                  * indicates order in the buddy
 61                                                  * system if PG_buddy is set.
 62                                                  */
 63                 struct address_space *mapping;  /* If low bit clear, points to
 64                                                  * inode address_space, or NULL.
 65                                                  * If page mapped as anonymous
 66                                                  * memory, low bit is set, and
 67                                                  * it points to anon_vma object:
 68                                                  * see PAGE_MAPPING_ANON below.
 69                                                  */
 70             };
 71 #if USE_SPLIT_PTLOCKS
 72             spinlock_t ptl;
 73 #endif
 74             struct kmem_cache *slab;    /* SLUB: Pointer to slab */
 75             struct page *first_page;    /* Compound tail pages */
 76         };
 77         union {
 78                 pgoff_t index;          /* Our offset within mapping. */
 79                 void *freelist;         /* SLUB: freelist req. slab lock */
 80         };
 81         struct list_head lru;           /* Pageout list, eg. active_list
 82                                          * protected by zone->lru_lock !
 83                                          */
 84         /*
 85          * On machines where all RAM is mapped into kernel address space,
 86          * we can simply calculate the virtual address. On machines with
 87          * highmem some memory is mapped into kernel virtual memory
 88          * dynamically, so we need a place to store that address.
 89          * Note that this field could be 16 bits on x86 ... ;)
 90          *
 91          * Architectures with slow multiplication can define
 92          * WANT_PAGE_VIRTUAL in asm/page.h
 93          */
 94 #if defined(WANT_PAGE_VIRTUAL)
 95         void *virtual;                  /* Kernel virtual address (NULL if
 96                                            not kmapped, ie. highmem) */
 97 #endif /* WANT_PAGE_VIRTUAL */
 98 #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
 99         unsigned long debug_flags;      /* Use atomic bitops on this */
100 #endif
101 
102 #ifdef CONFIG_KMEMCHECK
103         /*
104          * kmemcheck wants to track the status of each byte in a page; this
105          * is a pointer to such a status block. NULL if not tracked.
106          */
107         void *shadow;
108 #endif
109 };

flags: page flag bits describing the frame's current state; for example, PG_buddy indicates that the frame belongs to the buddy system
lru: links page descriptors into lists; for example, the buddy system links the first page descriptor of each free block into the per-order free lists through lru
private: page-private data; while the page is in the buddy system it stores the block's order
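To make the interplay between flags and private concrete, here is a minimal user-space sketch (not kernel code; the struct and helpers are hypothetical analogs of set_page_order()/rmv_page_order(), which the buddy system uses on real struct page):

#include <stdio.h>

#define FAKE_PG_BUDDY 1UL         /* hypothetical flag bit, analogous to PG_buddy */

struct fake_page {
        unsigned long flags;      /* state bits */
        unsigned long private;    /* order while the block sits on a free list */
};

/* analog of set_page_order(): mark the block head and remember its order */
static void fake_set_order(struct fake_page *p, unsigned long order)
{
        p->private = order;
        p->flags |= FAKE_PG_BUDDY;
}

/* analog of rmv_page_order(): clear the mark when the block is allocated */
static void fake_rmv_order(struct fake_page *p)
{
        p->flags &= ~FAKE_PG_BUDDY;
        p->private = 0;
}

int main(void)
{
        struct fake_page p = { 0, 0 };

        fake_set_order(&p, 3);    /* head of a free 2^3 = 8-page block */
        printf("in buddy system: %lu, order: %lu\n",
               (p.flags & FAKE_PG_BUDDY) ? 1UL : 0UL, p.private);
        fake_rmv_order(&p);
        printf("in buddy system: %lu\n", (p.flags & FAKE_PG_BUDDY) ? 1UL : 0UL);
        return 0;
}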

ii. Zone descriptor: struct zone
Hardware constraints mean that not every page frame can be used for every purpose (for example, ISA-bus DMA can only address the lowest 16MB, and a 32-bit CPU cannot map all physical memory into its kernel address space). Linux therefore groups page frames into zones.
On x86 the main zones are:
ZONE_DMA: below 16MB
ZONE_NORMAL: the kernel's directly mapped region, 16MB~896MB
ZONE_HIGHMEM: memory that cannot be directly mapped, above 896MB
Each zone is described by a zone descriptor:

 286 struct zone {
 287         /* Fields commonly accessed by the page allocator */
 288 
 289         /* zone watermarks, access with *_wmark_pages(zone) macros */
 290         unsigned long watermark[NR_WMARK];
 291 
 292         /*
 293          * When free pages are below this point, additional steps are taken
 294          * when reading the number of free pages to avoid per-cpu counter
 295          * drift allowing watermarks to be breached
 296          */
 297         unsigned long percpu_drift_mark;
 298 
 299         /*
 300          * We don't know if the memory that we're going to allocate will be freeable
 301          * or/and it will be released eventually, so to avoid totally wasting several
 302          * GB of ram we must reserve some of the lower zone memory (otherwise we risk
 303          * to run OOM on the lower zones despite there's tons of freeable ram
 304          * on the higher zones). This array is recalculated at runtime if the
 305          * sysctl_lowmem_reserve_ratio sysctl changes.
 306          */
 307         unsigned long           lowmem_reserve[MAX_NR_ZONES];
 308 
 309 #ifdef CONFIG_NUMA
 310         int node;
 311         /*
 312          * zone reclaim becomes active if more unmapped pages exist.
 313          */
 314         unsigned long           min_unmapped_pages;
 315         unsigned long           min_slab_pages;
 316         struct per_cpu_pageset  *pageset[NR_CPUS];
 317 #else
 318         struct per_cpu_pageset  pageset[NR_CPUS];
 319 #endif
 320         /*
 321          * free areas of different sizes
 322          */
 323         spinlock_t              lock;
 324 #ifdef CONFIG_MEMORY_HOTPLUG
 325         /* see spanned/present_pages for more description */
 326         seqlock_t               span_seqlock;
 327 #endif
 328         struct free_area        free_area[MAX_ORDER];
 329 
 330 #ifndef CONFIG_SPARSEMEM
 331         /*
 332          * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
 333          * In SPARSEMEM, this map is stored in struct mem_section
 334          */
 335         unsigned long           *pageblock_flags;
 336 #endif /* CONFIG_SPARSEMEM */
 337 
 338 
 339         ZONE_PADDING(_pad1_)
 340 
 341         /* Fields commonly accessed by the page reclaim scanner */
 342         spinlock_t              lru_lock;
 343         struct zone_lru {
 344                 struct list_head list;
 345         } lru[NR_LRU_LISTS];
 346 
 347         struct zone_reclaim_stat reclaim_stat;
 348 
 349         unsigned long           pages_scanned;     /* since last reclaim */
 350         unsigned long           flags;             /* zone flags, see below */
 351 
 352         /* Zone statistics */
 353         atomic_long_t           vm_stat[NR_VM_ZONE_STAT_ITEMS];
 354 
 355         /*
 356          * prev_priority holds the scanning priority for this zone.  It is
 357          * defined as the scanning priority at which we achieved our reclaim
 358          * target at the previous try_to_free_pages() or balance_pgdat()
 359          * invokation.
 360          *
 361          * We use prev_priority as a measure of how much stress page reclaim is
 362          * under - it drives the swappiness decision: whether to unmap mapped
 363          * pages.
 364          *
 365          * Access to both this field is quite racy even on uniprocessor.  But
 366          * it is expected to average out OK.
 367          */
 368         int prev_priority;
 369 
 370         /*
 371          * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
 372          * this zone's LRU.  Maintained by the pageout code.
 373          */
 374         unsigned int inactive_ratio;
 375 
 376 
 377         ZONE_PADDING(_pad2_)
 378         /* Rarely used or read-mostly fields */
 379 
 380         /*
 381          * wait_table           -- the array holding the hash table
 382          * wait_table_hash_nr_entries   -- the size of the hash table array
 383          * wait_table_bits      -- wait_table_size == (1 << wait_table_bits)
 384          *
 385          * The purpose of all these is to keep track of the people
 386          * waiting for a page to become available and make them
 387          * runnable again when possible. The trouble is that this
 388          * consumes a lot of space, especially when so few things
 389          * wait on pages at a given time. So instead of using
 390          * per-page waitqueues, we use a waitqueue hash table.
 391          *
 392          * The bucket discipline is to sleep on the same queue when
 393          * colliding and wake all in that wait queue when removing.
 394          * When something wakes, it must check to be sure its page is
 395          * truly available, a la thundering herd. The cost of a
 396          * collision is great, but given the expected load of the
 397          * table, they should be so rare as to be outweighed by the
 398          * benefits from the saved space.
 399          *
 400          * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
 401          * primary users of these fields, and in mm/page_alloc.c
 402          * free_area_init_core() performs the initialization of them.
 403          */
 404         wait_queue_head_t       * wait_table;
 405         unsigned long           wait_table_hash_nr_entries;
 406         unsigned long           wait_table_bits;
 407 
 408         /*
 409          * Discontig memory support fields.
 410          */
 411         struct pglist_data      *zone_pgdat;
 412         /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 413         unsigned long           zone_start_pfn;
 414 
 415         /*
 416          * zone_start_pfn, spanned_pages and present_pages are all
 417          * protected by span_seqlock.  It is a seqlock because it has
 418          * to be read outside of zone->lock, and it is done in the main
 419          * allocator path.  But, it is written quite infrequently.
 420          *
 421          * The lock is declared along with zone->lock because it is
 422          * frequently read in proximity to zone->lock.  It's good to
 423          * give them a chance of being in the same cacheline.
 424          */
 425         unsigned long           spanned_pages;  /* total size, including holes */
 426         unsigned long           present_pages;  /* amount of memory (excluding holes) */
 427 
 428         /*
 429          * rarely used fields:
 430          */
 431         const char              *name;
 432 } ____cacheline_internodealigned_in_smp;

free_area: the buddy system's free blocks; each zone has its own buddy system
pageset: the zone's per-CPU cache
zone_pgdat: the node this zone belongs to
zone_start_pfn: the zone's first page frame number
spanned_pages: number of page frames spanned by the zone, including holes
present_pages: number of page frames actually present in the zone, excluding holes
name: the zone's name


iii. Node descriptor: struct pglist_data
NUMA stands for Non-Uniform Memory Access: on a NUMA platform, the time a given CPU needs to access memory depends on the memory's address.
Memory is divided into nodes according to access time; a given CPU accesses all memory within one node in the same time, but different CPUs may see different access times for the same node.
Linux has supported NUMA since 2.6; each node is described by a pg_data_t structure.

include/linux/mmzone.h
 614 /*
 615  * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
 616  * (mostly NUMA machines?) to denote a higher-level memory zone than the
 617  * zone denotes.
 618  *
 619  * On NUMA machines, each NUMA node would have a pg_data_t to describe
 620  * it's memory layout.
 621  *
 622  * Memory statistics and page replacement data structures are maintained on a
 623  * per-zone basis.
 624  */
 625 struct bootmem_data;
 626 typedef struct pglist_data {
 627         struct zone node_zones[MAX_NR_ZONES];
 628         struct zonelist node_zonelists[MAX_ZONELISTS];
 629         int nr_zones;
 630 #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
 631         struct page *node_mem_map;
 632 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 633         struct page_cgroup *node_page_cgroup;
 634 #endif
 635 #endif
 636         struct bootmem_data *bdata;
 637 #ifdef CONFIG_MEMORY_HOTPLUG
 638         /*
 639          * Must be held any time you expect node_start_pfn, node_present_pages
 640          * or node_spanned_pages stay constant.  Holding this will also
 641          * guarantee that any pfn_valid() stays that way.
 642          *
 643          * Nests above zone->lock and zone->size_seqlock.
 644          */
 645         spinlock_t node_size_lock;
 646 #endif
 647         unsigned long node_start_pfn;
 648         unsigned long node_present_pages; /* total number of physical pages */
 649         unsigned long node_spanned_pages; /* total size of physical page
 650                                              range, including holes */
 651         int node_id;
 652         wait_queue_head_t kswapd_wait;
 653         struct task_struct *kswapd;
 654         int kswapd_max_order;
 655 } pg_data_t;

node_zones: the node's array of zone descriptors
node_zonelists: the order in which zones are tried by the zoned allocator, typically ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
nr_zones: number of zones in the node
node_mem_map: the node's array of page descriptors
node_start_pfn: the node's first page frame number
node_present_pages: number of page frames in the node, excluding holes
node_spanned_pages: number of page frames spanned by the node, including holes
node_id: the node ID

x86 uses the UMA model, which is treated as the special case of NUMA with a single node; this keeps the kernel code paths shared and reusable.

include/linux/mmzone.h
 788 #ifndef CONFIG_NEED_MULTIPLE_NODES
 789 
 790 extern struct pglist_data contig_page_data;
 791 #define NODE_DATA(nid)          (&contig_page_data)
mm/page_alloc.c
4472 #ifndef CONFIG_NEED_MULTIPLE_NODES
4473 struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
4474 EXPORT_SYMBOL(contig_page_data);
4475 #endif


II. Page Frame Management Initialization
i. Node initialization
pg_data_t initialization:

start_kernel->setup_arch->paging_init->zone_sizes_init->free_area_init_nodes->free_area_init_node
3922 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3923                 unsigned long node_start_pfn, unsigned long *zholes_size)
3924 {
3925         pg_data_t *pgdat = NODE_DATA(nid);
3926 
3927         pgdat->node_id = nid;
3928         pgdat->node_start_pfn = node_start_pfn;
3929         calculate_node_totalpages(pgdat, zones_size, zholes_size);
3930 
3931         alloc_node_mem_map(pgdat);
3932 #ifdef CONFIG_FLAT_NODE_MEM_MAP
3933         printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
3934                 nid, (unsigned long)pgdat,
3935                 (unsigned long)pgdat->node_mem_map);
3936 #endif
3937 
3938         free_area_init_core(pgdat, zones_size, zholes_size);
3939 }
3783 /*
3784  * Set up the zone data structures:
3785  *   - mark all pages reserved
3786  *   - mark all memory queues empty
3787  *   - clear the memory bitmaps
3788  */
3789 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3790                 unsigned long *zones_size, unsigned long *zholes_size)
3791 {
3792         enum zone_type j;
3793         int nid = pgdat->node_id;
3794         unsigned long zone_start_pfn = pgdat->node_start_pfn;
3795         int ret;
3796 
3797         pgdat_resize_init(pgdat);
3798         pgdat->nr_zones = 0;
3799         init_waitqueue_head(&pgdat->kswapd_wait);
3800         pgdat->kswapd_max_order = 0;
3801         pgdat_page_cgroup_init(pgdat);
3802 
3803         for (j = 0; j < MAX_NR_ZONES; j++) {
3804                 struct zone *zone = pgdat->node_zones + j;
3805                 unsigned long size, realsize, memmap_pages;
3806                 enum lru_list l;
3807 
3808                 size = zone_spanned_pages_in_node(nid, j, zones_size);
3809                 realsize = size - zone_absent_pages_in_node(nid, j,
3810                                                                 zholes_size);
3811 
3812                 /*
3813                  * Adjust realsize so that it accounts for how much memory
3814                  * is used by this zone for memmap. This affects the watermark
3815                  * and per-cpu initialisations
3816                  */
3817                 memmap_pages =
3818                         PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3819                 if (realsize >= memmap_pages) {
3820                         realsize -= memmap_pages;
3821                         if (memmap_pages)
3822                                 printk(KERN_DEBUG
3823                                        "  %s zone: %lu pages used for memmap\n",
3824                                        zone_names[j], memmap_pages);
3825                 } else
3826                         printk(KERN_WARNING
3827                                 "  %s zone: %lu pages exceeds realsize %lu\n",
3828                                 zone_names[j], memmap_pages, realsize);
3829 
3830                 /* Account for reserved pages */
3831                 if (j == 0 && realsize > dma_reserve) {
3832                         realsize -= dma_reserve;
3833                         printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
3834                                         zone_names[0], dma_reserve);
3835                 }
3836 
3837                 if (!is_highmem_idx(j))
3838                         nr_kernel_pages += realsize;
3839                 nr_all_pages += realsize;
3840 
3841                 zone->spanned_pages = size;
3842                 zone->present_pages = realsize;
3843 #ifdef CONFIG_NUMA
3844                 zone->node = nid;
3845                 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
3846                                                 / 100;
3847                 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
3848 #endif
3849                 zone->name = zone_names[j];
3850                 spin_lock_init(&zone->lock);
3851                 spin_lock_init(&zone->lru_lock);
3852                 zone_seqlock_init(zone);
3853                 zone->zone_pgdat = pgdat;
3854 
3855                 zone->prev_priority = DEF_PRIORITY;
3856 
3857                 zone_pcp_init(zone);
3858                 for_each_lru(l) {
3859                         INIT_LIST_HEAD(&zone->lru[l].list);
3860                         zone->reclaim_stat.nr_saved_scan[l] = 0;
3861                 }
3862                 zone->reclaim_stat.recent_rotated[0] = 0;
3863                 zone->reclaim_stat.recent_rotated[1] = 0;
3864                 zone->reclaim_stat.recent_scanned[0] = 0;
3865                 zone->reclaim_stat.recent_scanned[1] = 0;
3866                 zap_zone_vm_stats(zone);
3867                 zone->flags = 0;
3868                 if (!size)
3869                         continue;
3870 
3871                 set_pageblock_order(pageblock_default_order());
3872                 setup_usemap(pgdat, zone, size);
3873                 ret = init_currently_empty_zone(zone, zone_start_pfn,
3874                                                 size, MEMMAP_EARLY);
3875                 BUG_ON(ret);
3876                 memmap_init(size, nid, j, zone_start_pfn);
3877                 zone_start_pfn += size;
3878         }
3879 }


ii. Page descriptor array initialization
mem_map points to pg_data_t->node_mem_map

3015 #ifndef __HAVE_ARCH_MEMMAP_INIT
3016 #define memmap_init(size, nid, zone, start_pfn) \
3017         memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3018 #endif
2943 /*
2944  * Initially all pages are reserved - free ones are freed
2945  * up by free_all_bootmem() once the early boot process is
2946  * done. Non-atomic initialization, single-pass.
2947  */
2948 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2949                 unsigned long start_pfn, enum memmap_context context)
2950 {
2951         struct page *page;
2952         unsigned long end_pfn = start_pfn + size;
2953         unsigned long pfn;
2954         struct zone *z;
2955 
2956         if (highest_memmap_pfn < end_pfn - 1)
2957                 highest_memmap_pfn = end_pfn - 1;
2958 
2959         z = &NODE_DATA(nid)->node_zones[zone];
2960         for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2961                 /*
2962                  * There can be holes in boot-time mem_map[]s
2963                  * handed to this function.  They do not
2964                  * exist on hotplugged memory.
2965                  */
2966                 if (context == MEMMAP_EARLY) {
2967                         if (!early_pfn_valid(pfn))
2968                                 continue;
2969                         if (!early_pfn_in_nid(pfn, nid))
2970                                 continue;
2971                 }
2972                 page = pfn_to_page(pfn);
2973                 set_page_links(page, zone, nid, pfn);
2974                 mminit_verify_page_links(page, zone, nid, pfn);
2975                 init_page_count(page);
2976                 reset_page_mapcount(page);
2977                 SetPageReserved(page);
2978                 /*
2979                  * Mark the block movable so that blocks are reserved for
2980                  * movable at startup. This will force kernel allocations
2981                  * to reserve their blocks rather than leaking throughout
2982                  * the address space during boot when many long-lived
2983                  * kernel allocations are made. Later some blocks near
2984                  * the start are marked MIGRATE_RESERVE by
2985                  * setup_zone_migrate_reserve()
2986                  *
2987                  * bitmap is created for zone's valid pfn range. but memmap
2988                  * can be created for invalid pages (for alignment)
2989                  * check here not to call set_pageblock_migratetype() against
2990                  * pfn out of zone.
2991                  */
2992                 if ((z->zone_start_pfn <= pfn)
2993                     && (pfn < z->zone_start_pfn + z->spanned_pages)
2994                     && !(pfn & (pageblock_nr_pages - 1)))
2995                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2996 
2997                 INIT_LIST_HEAD(&page->lru);
2998 #ifdef WANT_PAGE_VIRTUAL
2999                 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3000                 if (!is_highmem_idx(zone))
3001                         set_page_address(page, __va(pfn << PAGE_SHIFT));
3002 #endif
3003         }
3004 }


iii. Zone descriptor initialization

3305 __meminit int init_currently_empty_zone(struct zone *zone,
3306                                         unsigned long zone_start_pfn,
3307                                         unsigned long size,
3308                                         enum memmap_context context)
3309 {
3310         struct pglist_data *pgdat = zone->zone_pgdat;
3311         int ret;
3312         ret = zone_wait_table_init(zone, size);
3313         if (ret)
3314                 return ret;
3315         pgdat->nr_zones = zone_idx(zone) + 1;
3316 
3317         zone->zone_start_pfn = zone_start_pfn;
3318 
3319         mminit_dprintk(MMINIT_TRACE, "memmap_init",
3320                         "Initialising map node %d zone %lu pfns %lu -> %lu\n",
3321                         pgdat->node_id,
3322                         (unsigned long)zone_idx(zone),
3323                         zone_start_pfn, (zone_start_pfn + size));
3324 
3325         zone_init_free_lists(zone);
3326 
3327         return 0;
3328 }
3006 static void __meminit zone_init_free_lists(struct zone *zone)
3007 {
3008         int order, t;
3009         for_each_migratetype_order(order, t) {
3010                 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3011                 zone->free_area[order].nr_free = 0;
3012         }
3013 }


 

III. The Buddy System
External fragmentation: after many allocations and frees of blocks of different sizes, small free blocks end up scattered between allocated blocks, so that no sufficiently large contiguous block can be allocated even though the total amount of free memory is large. For example, if allocated blocks occur at intervals of less than 1024 page frames, a contiguous run of 1024 page frames can never be obtained again.
The buddy system's main purpose is to limit external fragmentation; the block handed out may be larger than requested, but never more than twice the requested size.
By default the Linux buddy system keeps 11 levels (orders 0, 1, ..., 10) of free lists, each linking together the free blocks of that order via the first page descriptor of each block. A block of order k holds 1<<k page frames and its starting page frame number is a multiple of 1<<k; the smallest block is a single page frame, the largest is 1<<10 page frames.
A block of order k+1 contains a pair of order-k buddy blocks: it can be split into two order-k buddies, and two order-k buddies can be merged back into one order-(k+1) block.
During allocation, an oversized block is repeatedly split and the unused buddy halves are placed on the free lists of the corresponding orders, until the block can be split no further;
during freeing, buddies are repeatedly merged until the highest order is reached or no free buddy remains.

i. Allocating a block
Allocation searches the free lists from the smallest sufficient order upward until a free block is found or none exists.
If the block found matches the requested size exactly, the page descriptor of its first page frame is returned directly;
otherwise the surplus is repeatedly halved, and the upper halves are added to the corresponding free lists.
For example, suppose a free block of 8 contiguous page frames [1 2 3 4 5 6 7 8] is found for a request of 2 pages: [5 6 7 8] is first placed on the order-2 free list, then [3 4] on the order-1 free list, and the page descriptor of frame 1, which now heads the 2-page block [1 2], is returned.
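As a rough illustration of that split, the following small user-space C program simulates the arithmetic of expand() (a sketch only: pfn 0 is assumed to start an order-3 block, the request is order 1, the free lists are merely printed, and all names are hypothetical):

#include <stdio.h>

/* Simulate splitting a block of order `high` down to the requested order `low`,
 * printing which buddy halves would be placed on which free list.
 * Mirrors the arithmetic of expand() in mm/page_alloc.c. */
static void split_block(unsigned long first_pfn, int low, int high)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                /* the upper half [first_pfn+size, first_pfn+2*size) goes back
                 * onto the free list of the new, smaller order */
                printf("put pfns %lu..%lu on order-%d free list\n",
                       first_pfn + size, first_pfn + 2 * size - 1, high);
        }
        printf("return pfns %lu..%lu to the caller (order %d)\n",
               first_pfn, first_pfn + size - 1, low);
}

int main(void)
{
        split_block(0, 1, 3);   /* 8-page block, 2-page request */
        return 0;
}

Running it prints that pfns 4..7 go to the order-2 list, pfns 2..3 to the order-1 list, and pfns 0..1 are returned, matching the [1 2 3 4 5 6 7 8] example above.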

__rmqueue finds a free block within a zone; on success it returns the page descriptor of the block's first page frame, on failure NULL. The zone lock must be held before calling __rmqueue to protect the buddy system data.

 643 /*
 644  * The order of subdivision here is critical for the IO subsystem.
 645  * Please do not alter this order without good reasons and regression
 646  * testing. Specifically, as large blocks of memory are subdivided,
 647  * the order in which smaller blocks are delivered depends on the order
 648  * they're subdivided in this function. This is the primary factor
 649  * influencing the order in which pages are delivered to the IO
 650  * subsystem according to empirical testing, and this is also justified
 651  * by considering the behavior of a buddy system containing a single
 652  * large block of memory acted on by a series of small allocations.
 653  * This behavior is a critical factor in sglist merging's success.
 654  *
 655  * -- wli
 656  */
 657 static inline void expand(struct zone *zone, struct page *page,
 658         int low, int high, struct free_area *area,
 659         int migratetype)
 660 {
 661         unsigned long size = 1 << high;
 662 
 663         while (high > low) {
 664                 area--;
 665                 high--;
 666                 size >>= 1;
 667                 VM_BUG_ON(bad_range(zone, &page[size]));
 668                 list_add(&page[size].lru, &area->free_list[migratetype]);
 669                 area->nr_free++;
 670                 set_page_order(&page[size], high);
 671         }
 672 }
 714 /*
 715  * Go through the free lists for the given migratetype and remove
 716  * the smallest available page from the freelists
 717  */
 718 static inline
 719 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 720                                                 int migratetype)
 721 {
 722         unsigned int current_order;
 723         struct free_area * area;
 724         struct page *page;
 725 
 726         /* Find a page of the appropriate size in the preferred list */
 727         for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 728                 area = &(zone->free_area[current_order]);
 729                 if (list_empty(&area->free_list[migratetype]))
 730                         continue;
 731 
 732                 page = list_entry(area->free_list[migratetype].next,
 733                                                         struct page, lru);
 734                 list_del(&page->lru);
 735                 rmv_page_order(page);
 736                 area->nr_free--;
 737                 expand(zone, page, order, current_order, area, migratetype);
 738                 return page;
 739         }
 740 
 741         return NULL;
 742 }
 907 /*
 908  * Do the hard work of removing an element from the buddy allocator.
 909  * Call me with the zone->lock already held.
 910  */
 911 static struct page *__rmqueue(struct zone *zone, unsigned int order,
 912                                                 int migratetype)
 913 {
 914         struct page *page;
 915 
 916 retry_reserve:
 917         page = __rmqueue_smallest(zone, order, migratetype);
 918 
 919         if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
 920                 page = __rmqueue_fallback(zone, order, migratetype);
 921 
 922                 /*
 923                  * Use MIGRATE_RESERVE rather than fail an allocation. goto
 924                  * is used because __rmqueue_smallest is an inline function
 925                  * and we want just one call site
 926                  */
 927                 if (!page) {
 928                         migratetype = MIGRATE_RESERVE;
 929                         goto retry_reserve;
 930                 }
 931         }
 932 
 933         trace_mm_page_alloc_zone_locked(page, order, migratetype);
 934         return page;
 935 }

1. Scan the free lists from the requested order upward and take the first block that is large enough; remove it from its free list and decrement that list's free-block count.
2. If the block found is larger than requested (its size is the requested size times a power of two), repeatedly halve it, adding the upper half to the free list of the corresponding order each time.

ii. Freeing a block
Freeing a block mainly performs buddy merging: each free buddy found is removed from its free list, and the final merged block is added to the free list of the resulting order.
1. Determining whether two blocks are buddies

include/linux/page-flags.h:
139 #define TESTPAGEFLAG(uname, lname)                                      \
140 static inline int Page##uname(struct page *page)                        \
141                         { return test_bit(PG_##lname, &page->flags); }
235 __PAGEFLAG(Buddy, buddy)
mm/page_alloc.c:
 396 /*
 397  * This function checks whether a page is free && is the buddy
 398  * we can do coalesce a page and its buddy if
 399  * (a) the buddy is not in a hole &&
 400  * (b) the buddy is in the buddy system &&
 401  * (c) a page and its buddy have the same order &&
 402  * (d) a page and its buddy are in the same zone.
 403  *
 404  * For recording whether a page is in the buddy system, we use PG_buddy.
 405  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 406  *
 407  * For recording page's order, we use page_private(page).
 408  */
 409 static inline int page_is_buddy(struct page *page, struct page *buddy,
 410                                                                 int order)
 411 {
 412         if (!pfn_valid_within(page_to_pfn(buddy)))
 413                 return 0;
 414 
 415         if (page_zone_id(page) != page_zone_id(buddy))
 416                 return 0;
 417 
 418         if (PageBuddy(buddy) && page_order(buddy) == order) {
 419                 VM_BUG_ON(page_count(buddy) != 0);
 420                 return 1;
 421         }
 422         return 0;
 423 }

A candidate buddy qualifies for merging when:
a. its first page frame corresponds to a real physical frame (it is not in a hole)
b. it is in the same zone
c. it has the same order
d. it is in the buddy system (i.e. free)

2. Locating the buddy

 365 /*
 366  * Locate the struct page for both the matching buddy in our
 367  * pair (buddy1) and the combined O(n+1) page they form (page).
 368  *
 369  * 1) Any buddy B1 will have an order O twin B2 which satisfies
 370  * the following equation:
 371  *     B2 = B1 ^ (1 << O)
 372  * For example, if the starting buddy (buddy2) is #8 its order
 373  * 1 buddy is #10:
 374  *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 375  *
 376  * 2) Any buddy B will have an order O+1 parent P which
 377  * satisfies the following equation:
 378  *     P = B & ~(1 << O)
 379  *
 380  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 381  */
 382 static inline struct page *
 383 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
 384 {
 385         unsigned long buddy_idx = page_idx ^ (1 << order);
 386 
 387         return page + (buddy_idx - page_idx);
 388 }
 389 
 390 static inline unsigned long
 391 __find_combined_index(unsigned long page_idx, unsigned int order)
 392 {
 393         return (page_idx & ~(1 << order));
 394 }

__page_find_buddy returns the page descriptor of the first frame of the buddy block. A block and its buddy differ only in the 1<<order bit of the page index (if it is 1 in one, it is 0 in the other), so toggling that bit of page_idx yields the buddy's index.
__find_combined_index returns the index of the first page frame of the merged block: clearing the 1<<order bit of page_idx gives the merged block's starting index.
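A quick user-space check of that bit arithmetic (a sketch only; the helper names are hypothetical, the formulas are those of __page_find_buddy()/__find_combined_index()):

#include <stdio.h>

static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);     /* toggle bit `order` */
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);    /* clear bit `order` */
}

int main(void)
{
        /* the example from the kernel comment: block #8 at order 1 */
        printf("buddy of 8 at order 1:   %lu\n", buddy_index(8, 1));     /* 10 */
        printf("merged block starts at:  %lu\n", combined_index(8, 1));  /*  8 */
        printf("buddy of 10 at order 1:  %lu\n", buddy_index(10, 1));    /*  8 */
        printf("merged block starts at:  %lu\n", combined_index(10, 1)); /*  8 */
        return 0;
}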

3. Merging buddies

 425 /*
 426  * Freeing function for a buddy system allocator.
 427  *
 428  * The concept of a buddy system is to maintain direct-mapped table
 429  * (containing bit values) for memory blocks of various "orders".
 430  * The bottom level table contains the map for the smallest allocatable
 431  * units of memory (here, pages), and each level above it describes
 432  * pairs of units from the levels below, hence, "buddies".
 433  * At a high level, all that happens here is marking the table entry
 434  * at the bottom level available, and propagating the changes upward
 435  * as necessary, plus some accounting needed to play nicely with other
 436  * parts of the VM system.
 437  * At each level, we keep a list of pages, which are heads of continuous
 438  * free pages of length of (1 << order) and marked with PG_buddy. Page's
 439  * order is recorded in page_private(page) field.
 440  * So when we are allocating or freeing one, we can derive the state of the
 441  * other.  That is, if we allocate a small block, and both were   
 442  * free, the remainder of the region must be split into blocks.   
 443  * If a block is freed, and its buddy is also free, then this
 444  * triggers coalescing into a block of larger size.            
 445  *
 446  * -- wli
 447  */
 448 
 449 static inline void __free_one_page(struct page *page,
 450                 struct zone *zone, unsigned int order,
 451                 int migratetype)
 452 {
 453         unsigned long page_idx;
 454 
 455         if (unlikely(PageCompound(page)))
 456                 if (unlikely(destroy_compound_page(page, order)))
 457                         return;
 458 
 459         VM_BUG_ON(migratetype == -1);
 460 
 461         page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 462 
 463         VM_BUG_ON(page_idx & ((1 << order) - 1));
 464         VM_BUG_ON(bad_range(zone, page));
 465 
 466         while (order < MAX_ORDER-1) {
 467                 unsigned long combined_idx;
 468                 struct page *buddy;
 469 
 470                 buddy = __page_find_buddy(page, page_idx, order);
 471                 if (!page_is_buddy(page, buddy, order))
 472                         break;
 473 
 474                 /* Our buddy is free, merge with it and move up one order. */
 475                 list_del(&buddy->lru);
 476                 zone->free_area[order].nr_free--;
 477                 rmv_page_order(buddy);
 478                 combined_idx = __find_combined_index(page_idx, order);
 479                 page = page + (combined_idx - page_idx);
 480                 page_idx = combined_idx;
 481                 order++;
 482         }
 483         set_page_order(page, order);
 484         list_add(&page->lru,
 485                 &zone->free_area[order].free_list[migratetype]);
 486         zone->free_area[order].nr_free++;
 487 }
The loop repeatedly looks for a free buddy until none is found or MAX_ORDER-1 is reached; each buddy found is removed from its free list and merged, and the final merged block is added to the free list of the resulting order.
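The merge loop can be paraphrased with a small user-space sketch (purely illustrative; free_order[] stands in for the per-order free lists, and all names are hypothetical):

#include <stdio.h>

#define MAX_ORDER 11
#define NPAGES    64

/* free_order[i] == k means page i heads a free block of order k; -1 means
 * "not a free block head" (either allocated or inside another free block). */
static int free_order[NPAGES];

static void free_block(unsigned long page_idx, unsigned int order)
{
        while (order < MAX_ORDER - 1) {
                unsigned long buddy = page_idx ^ (1UL << order);

                if (buddy >= NPAGES || free_order[buddy] != (int)order)
                        break;                          /* buddy not free at this order */

                free_order[buddy] = -1;                 /* take buddy off its free list */
                page_idx &= ~(1UL << order);            /* head of the combined block */
                order++;
                printf("merged up to order %u at page %lu\n", order, page_idx);
        }
        free_order[page_idx] = order;                   /* put final block on a free list */
}

int main(void)
{
        for (int i = 0; i < NPAGES; i++)
                free_order[i] = -1;

        free_order[2] = 1;      /* pretend [2,3] is already a free order-1 block */
        free_block(0, 1);       /* freeing [0,1] should merge into order-2 [0..3] */
        printf("page 0 now heads a free block of order %d\n", free_order[0]);
        return 0;
}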

iii. Releasing bootmem memory

start_kernel->mm_init->mem_init->free_all_bootmem->free_all_bootmem_core->__free_pages_bootmem->__free_pages->free_one_page
 614 /*
 615  * permit the bootmem allocator to evade page validation on high-order frees
 616  */
 617 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 618 {
 619         if (order == 0) {
 620                 __ClearPageReserved(page);
 621                 set_page_count(page, 0);
 622                 set_page_refcounted(page);
 623                 __free_page(page);
 624         } else {
 625                 int loop;
 626 
 627                 prefetchw(page);
 628                 for (loop = 0; loop < BITS_PER_LONG; loop++) {
 629                         struct page *p = &page[loop];
 630 
 631                         if (loop + 1 < BITS_PER_LONG)
 632                                 prefetchw(p + 1);
 633                         __ClearPageReserved(p);
 634                         set_page_count(p, 0);
 635                 }
 636 
 637                 set_page_refcounted(page);
 638                 __free_pages(page, order);
 639         }
 640 }
 583 static void __free_pages_ok(struct page *page, unsigned int order)
 584 {
 585         unsigned long flags;
 586         int i;
 587         int bad = 0;
 588         int wasMlocked = __TestClearPageMlocked(page);
 589 
 590         kmemcheck_free_shadow(page, order);
 591 
 592         for (i = 0 ; i < (1 << order) ; ++i)
 593                 bad += free_pages_check(page + i);
 594         if (bad)
 595                 return;
 596 
 597         if (!PageHighMem(page)) {
 598                 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
 599                 debug_check_no_obj_freed(page_address(page),
 600                                            PAGE_SIZE << order);
 601         }
 602         arch_free_page(page, order);
 603         kernel_map_pages(page, 1 << order, 0);
 604 
 605         local_irq_save(flags);
 606         if (unlikely(wasMlocked))
 607                 free_page_mlock(page);
 608         __count_vm_events(PGFREE, 1 << order);
 609         free_one_page(page_zone(page), page, order,
 610                                         get_pageblock_migratetype(page));
 611         local_irq_restore(flags);
 612 }

During node descriptor initialization every page frame is marked reserved; when bootmem releases its free memory, the page descriptors are marked free and the frames are handed to the buddy system or to the per-CPU cache.

IV. Per-CPU Page Frame Cache
The kernel allocates and frees single page frames very frequently, so each zone has a per-CPU page frame cache to improve performance.
The per-CPU cache holds page frames pre-allocated from the buddy system or bootmem, ready for the local CPU to draw from.
i. Data structures

include/linux/mmzone.h:
 169 struct per_cpu_pages {
 170         int count;              /* number of pages in the list */
 171         int high;               /* high watermark, emptying needed */
 172         int batch;              /* chunk size for buddy add/remove */
 173 
 174         /* Lists of pages, one per migrate type stored on the pcp-lists */
 175         struct list_head lists[MIGRATE_PCPTYPES];
 176 };
 177 
 178 struct per_cpu_pageset {
 179         struct per_cpu_pages pcp;
 180 #ifdef CONFIG_NUMA
 181         s8 expire;
 182 #endif
 183 #ifdef CONFIG_SMP
 184         s8 stat_threshold;
 185         s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
 186 #endif
 187 } ____cacheline_aligned_in_smp;

count: number of page frames currently available in the cache
high: upper limit on cached free frames
batch: number of frames fetched from the buddy system in one go when the cache is empty, and the number returned to the buddy system when the cache exceeds high
lists: the per-migratetype lists of cached page frames
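How count, high and batch interact can be sketched in user space as follows (a toy model, not kernel code; the refill and drain prints stand in for rmqueue_bulk() and free_pcppages_bulk(), and all names are hypothetical):

#include <stdio.h>

struct pcp_model {
        int count;      /* frames currently cached */
        int high;       /* drain threshold */
        int batch;      /* refill/drain chunk size */
};

/* allocate one frame: refill `batch` frames from the buddy system if empty */
static void pcp_alloc_one(struct pcp_model *pcp)
{
        if (pcp->count == 0) {
                printf("refill: take %d frames from the buddy system\n", pcp->batch);
                pcp->count += pcp->batch;
        }
        pcp->count--;
}

/* free one frame: drain `batch` frames back if the cache grows past `high` */
static void pcp_free_one(struct pcp_model *pcp)
{
        pcp->count++;
        if (pcp->count >= pcp->high) {
                printf("drain: return %d frames to the buddy system\n", pcp->batch);
                pcp->count -= pcp->batch;
        }
}

int main(void)
{
        struct pcp_model pcp = { .count = 0, .high = 6, .batch = 4 };

        pcp_alloc_one(&pcp);            /* triggers a refill */
        for (int i = 0; i < 6; i++)
                pcp_free_one(&pcp);     /* eventually triggers a drain */
        printf("cached frames left: %d\n", pcp.count);
        return 0;
}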

ii. Allocating from the per-CPU cache

1175 /*
1176  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
1177  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
1178  * or two.
1179  */
1180 static inline
1181 struct page *buffered_rmqueue(struct zone *preferred_zone,
1182                         struct zone *zone, int order, gfp_t gfp_flags,
1183                         int migratetype)
1184 {
1185         unsigned long flags;
1186         struct page *page;
1187         int cold = !!(gfp_flags & __GFP_COLD);
1188         int cpu;
1189 
1190 again:
1191         cpu  = get_cpu();
1192         if (likely(order == 0)) {
1193                 struct per_cpu_pages *pcp;
1194                 struct list_head *list;
1195 
1196                 pcp = &zone_pcp(zone, cpu)->pcp;
1197                 list = &pcp->lists[migratetype];
1198                 local_irq_save(flags);
1199                 if (list_empty(list)) {
1200                         pcp->count += rmqueue_bulk(zone, 0,
1201                                         pcp->batch, list,
1202                                         migratetype, cold);
1203                         if (unlikely(list_empty(list)))
1204                                 goto failed;
1205                 }
1206 
1207                 if (cold)
1208                         page = list_entry(list->prev, struct page, lru);
1209                 else
1210                         page = list_entry(list->next, struct page, lru);
1211 
1212                 list_del(&page->lru);
1213                 pcp->count--;
1214         } else {
1215                 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1216                         /*
1217                          * __GFP_NOFAIL is not to be used in new code.
1218                          *
1219                          * All __GFP_NOFAIL callers should be fixed so that they
1220                          * properly detect and handle allocation failures.
1221                          *
1222                          * We most definitely don't want callers attempting to
1223                          * allocate greater than order-1 page units with
1224                          * __GFP_NOFAIL.
1225                          */
1226                         WARN_ON_ONCE(order > 1);
1227                 }
1228                 spin_lock_irqsave(&zone->lock, flags);
1229                 page = __rmqueue(zone, order, migratetype);
1230                 spin_unlock(&zone->lock);
1231                 if (!page)
1232                         goto failed;
1233                 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1234         }
1235 
1236         __count_zone_vm_events(PGALLOC, zone, 1 << order);
1237         zone_statistics(preferred_zone, zone);
1238         local_irq_restore(flags);
1239         put_cpu();
1240 
1241         VM_BUG_ON(bad_range(zone, page));
1242         if (prep_new_page(page, order, gfp_flags))
1243                 goto again;
1244         return page;
1245 
1246 failed:
1247         local_irq_restore(flags);
1248         put_cpu();
1249         return NULL;
1250 }

When a single page frame is requested (order == 0), it is taken from the per-CPU cache;
if the cache is empty, pcp->batch frames are first requested from the buddy system and added to the cache;
the requested frame is then removed from the cache and returned.

iii. Freeing into the per-CPU cache

1078 /*
1079  * Free a 0-order page
1080  */
1081 static void free_hot_cold_page(struct page *page, int cold)
1082 {
1083         struct zone *zone = page_zone(page);
1084         struct per_cpu_pages *pcp;
1085         unsigned long flags;
1086         int migratetype;
1087         int wasMlocked = __TestClearPageMlocked(page);
1088 
1089         kmemcheck_free_shadow(page, 0);
1090 
1091         if (PageAnon(page))
1092                 page->mapping = NULL;
1093         if (free_pages_check(page))
1094                 return;
1095 
1096         if (!PageHighMem(page)) {
1097                 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1098                 debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
1099         }
1100         arch_free_page(page, 0);
1101         kernel_map_pages(page, 1, 0);
1102 
1103         pcp = &zone_pcp(zone, get_cpu())->pcp;
1104         migratetype = get_pageblock_migratetype(page);
1105         set_page_private(page, migratetype);
1106         local_irq_save(flags);
1107         if (unlikely(wasMlocked))
1108                 free_page_mlock(page);
1109         __count_vm_event(PGFREE);
1110 
1111         /*
1112          * We only track unmovable, reclaimable and movable on pcp lists.
1113          * Free ISOLATE pages back to the allocator because they are being
1114          * offlined but treat RESERVE as movable pages so we can get those
1115          * areas back if necessary. Otherwise, we may have to free
1116          * excessively into the page allocator
1117          */
1118         if (migratetype >= MIGRATE_PCPTYPES) {
1119                 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1120                         free_one_page(zone, page, 0, migratetype);
1121                         goto out;
1122                 }
1123                 migratetype = MIGRATE_MOVABLE;
1124         }
1125 
1126         if (cold)
1127                 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1128         else
1129                 list_add(&page->lru, &pcp->lists[migratetype]);
1130         pcp->count++;
1131         if (pcp->count >= pcp->high) {
1132                 free_pcppages_bulk(zone, pcp->batch, pcp);
1133                 pcp->count -= pcp->batch;
1134         }
1135 
1136 out:
1137         local_irq_restore(flags);
1138         put_cpu();
1139 }
2019 void __free_pages(struct page *page, unsigned int order)
2020 {
2021         if (put_page_testzero(page)) {
2022                 trace_mm_page_free_direct(page, order);
2023                 if (order == 0)
2024                         free_hot_page(page);
2025                 else
2026                         __free_pages_ok(page, order);
2027         }
2028 }

When a single page frame is freed (order == 0), it is put back into the per-CPU cache;
if the number of cached frames then exceeds pcp->high, pcp->batch frames are returned to the buddy system.


V. Zone Allocator
The zone allocator is the entry point of the kernel page frame allocator. It first uses the zonelist to decide which zone to allocate from, then allocates the page frames from that zone's per-CPU cache or its buddy system.
i. Zone memory allocation
1. The allocation entry point

1935 /*
1936  * This is the 'heart' of the zoned buddy allocator.
1937  */
1938 struct page *
1939 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1940                         struct zonelist *zonelist, nodemask_t *nodemask)
1941 {
1942         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1943         struct zone *preferred_zone;
1944         struct page *page;
1945         int migratetype = allocflags_to_migratetype(gfp_mask);
1946 
1947         gfp_mask &= gfp_allowed_mask;
1948 
1949         lockdep_trace_alloc(gfp_mask);
1950 
1951         might_sleep_if(gfp_mask & __GFP_WAIT);
1952 
1953         if (should_fail_alloc_page(gfp_mask, order))
1954                 return NULL;
1955 
1956         /*
1957          * Check the zones suitable for the gfp_mask contain at least one
1958          * valid zone. It's possible to have an empty zonelist as a result
1959          * of GFP_THISNODE and a memoryless node
1960          */
1961         if (unlikely(!zonelist->_zonerefs->zone))
1962                 return NULL;
1963 
1964         /* The preferred zone is used for statistics later */
1965         first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1966         if (!preferred_zone)
1967                 return NULL;
1968 
1969         /* First allocation attempt */
1970         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1971                         zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1972                         preferred_zone, migratetype);
1973         if (unlikely(!page))
1974                 page = __alloc_pages_slowpath(gfp_mask, order,
1975                                 zonelist, high_zoneidx, nodemask,
1976                                 preferred_zone, migratetype);
1977 
1978         trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1979         return page;
1980 }

The zonelist and gfp_mask first determine which zone allocation starts from;
the page frame block is then allocated from that zone's per-CPU cache or buddy system.

2. Selecting the zone from the zonelist and gfp_mask

include/linux/gfp.h:
149 /*
150  * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
151  * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long
152  * and there are 16 of them to cover all possible combinations of
153  * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM
154  *
155  * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
156  * But GFP_MOVABLE is not only a zone specifier but also an allocation
157  * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
158  * Only 1bit of the lowest 3 bit (DMA,DMA32,HIGHMEM) can be set to "1".
159  *
160  *       bit       result
161  *       =================
162  *       0x0    => NORMAL
163  *       0x1    => DMA or NORMAL
164  *       0x2    => HIGHMEM or NORMAL
165  *       0x3    => BAD (DMA+HIGHMEM)
166  *       0x4    => DMA32 or DMA or NORMAL
167  *       0x5    => BAD (DMA+DMA32)
168  *       0x6    => BAD (HIGHMEM+DMA32)
169  *       0x7    => BAD (HIGHMEM+DMA32+DMA)
170  *       0x8    => NORMAL (MOVABLE+0)
171  *       0x9    => DMA or NORMAL (MOVABLE+DMA)
172  *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
173  *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
174  *       0xc    => DMA32 (MOVABLE+HIGHMEM+DMA32)
175  *       0xd    => BAD (MOVABLE+DMA32+DMA)
176  *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
177  *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
178  *
179  * ZONES_SHIFT must be <= 2 on 32 bit platforms.
180  */
181 
182 #if 16 * ZONES_SHIFT > BITS_PER_LONG
183 #error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
184 #endif
185 
186 #define GFP_ZONE_TABLE ( \
187         (ZONE_NORMAL << 0 * ZONES_SHIFT)                                \
188         | (OPT_ZONE_DMA << __GFP_DMA * ZONES_SHIFT)                     \
189         | (OPT_ZONE_HIGHMEM << __GFP_HIGHMEM * ZONES_SHIFT)             \
190         | (OPT_ZONE_DMA32 << __GFP_DMA32 * ZONES_SHIFT)                 \
191         | (ZONE_NORMAL << __GFP_MOVABLE * ZONES_SHIFT)                  \
192         | (OPT_ZONE_DMA << (__GFP_MOVABLE | __GFP_DMA) * ZONES_SHIFT)   \
193         | (ZONE_MOVABLE << (__GFP_MOVABLE | __GFP_HIGHMEM) * ZONES_SHIFT)\
194         | (OPT_ZONE_DMA32 << (__GFP_MOVABLE | __GFP_DMA32) * ZONES_SHIFT)\
195 )
196 
197 /*
198  * GFP_ZONE_BAD is a bitmap for all combination of __GFP_DMA, __GFP_DMA32
199  * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
200  * entry starting with bit 0. Bit is set if the combination is not
201  * allowed.
202  */
203 #define GFP_ZONE_BAD ( \
204         1 << (__GFP_DMA | __GFP_HIGHMEM)                                \
205         | 1 << (__GFP_DMA | __GFP_DMA32)                                \
206         | 1 << (__GFP_DMA32 | __GFP_HIGHMEM)                            \
207         | 1 << (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)                \
208         | 1 << (__GFP_MOVABLE | __GFP_HIGHMEM | __GFP_DMA)              \
209         | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA)                \
210         | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_HIGHMEM)            \
211         | 1 << (__GFP_MOVABLE | __GFP_DMA32 | __GFP_DMA | __GFP_HIGHMEM)\
212 )
213 
214 static inline enum zone_type gfp_zone(gfp_t flags)
215 {
216         enum zone_type z;
217         int bit = flags & GFP_ZONEMASK;
218 
219         z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
220                                          ((1 << ZONES_SHIFT) - 1);
221 
222         if (__builtin_constant_p(bit))
223                 MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
224         else {
225 #ifdef CONFIG_DEBUG_VM
226                 BUG_ON((GFP_ZONE_BAD >> bit) & 1);
227 #endif
228         }
229         return z;
230 }
include/linux/mmzone.h
 870 /**
 871  * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 872  * @zonelist - The zonelist to search for a suitable zone
 873  * @highest_zoneidx - The zone index of the highest zone to return
 874  * @nodes - An optional nodemask to filter the zonelist with
 875  * @zone - The first suitable zone found is returned via this parameter
 876  *
 877  * This function returns the first zone at or below a given zone index that is
 878  * within the allowed nodemask. The zoneref returned is a cursor that can be
 879  * used to iterate the zonelist with next_zones_zonelist by advancing it by
 880  * one before calling.
 881  */
 882 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 883                                         enum zone_type highest_zoneidx,
 884                                         nodemask_t *nodes,
 885                                         struct zone **zone)
 886 {
 887         return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
 888                                                                 zone);
 889 }
 mm/mmzone.c:
 55 /* Returns the next zone at or below highest_zoneidx in a zonelist */
 56 struct zoneref *next_zones_zonelist(struct zoneref *z,
 57                                         enum zone_type highest_zoneidx,
 58                                         nodemask_t *nodes,
 59                                         struct zone **zone)
 60 {
 61         /*
 62          * Find the next suitable zone to use for the allocation.
 63          * Only filter based on nodemask if it's set
 64          */
 65         if (likely(nodes == NULL))
 66                 while (zonelist_zone_idx(z) > highest_zoneidx)
 67                         z++;
 68         else
 69                 while (zonelist_zone_idx(z) > highest_zoneidx ||
 70                                 (z->zone && !zref_in_nodemask(z, nodes)))
 71                         z++;
 72 
 73         *zone = zonelist_zone(z);
 74         return z;
 75 }

__GFP_DMA, __GFP_HIGHMEM, __GFP_DMA32 and __GFP_MOVABLE occupy the four lowest bits of gfp_mask, so every combination of these flags has a distinct value; there are 16 possible combinations, and each table entry occupies ZONES_SHIFT bits.
GFP_ZONE_TABLE: encodes, for each combination of zone modifier flags in gfp_mask, the zone to use
GFP_ZONE_BAD: encodes the combinations of zone modifier flags that are not allowed
zonelist: defines the order in which zones are tried, normally ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA; the next zone is used only when the previous one fails, e.g. ZONE_DMA is used only when both ZONE_HIGHMEM and ZONE_NORMAL have no free memory
gfp_zone: extracts the zone index from GFP_ZONE_TABLE using the zone modifier bits (__GFP_DMA, __GFP_HIGHMEM, __GFP_DMA32, __GFP_MOVABLE) of gfp_mask
first_zones_zonelist: finds the first zone in the zonelist whose index is at or below high_zoneidx
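The lookup in gfp_zone() is plain shift-and-mask arithmetic; the following user-space sketch rebuilds a toy version of it (the zone numbering and flag values are assumptions modelled on the 32-bit x86 layout above, not the real kernel constants, and names are hypothetical):

#include <stdio.h>

/* toy zone numbering, modelled on a 32-bit x86 layout */
enum toy_zone { TOY_DMA = 0, TOY_NORMAL = 1, TOY_HIGHMEM = 2, TOY_MOVABLE = 3 };

#define TOY_GFP_DMA      0x1u   /* bit 0 */
#define TOY_GFP_HIGHMEM  0x2u   /* bit 1 */
#define TOY_GFP_DMA32    0x4u   /* bit 2: folded into NORMAL here (no DMA32 zone) */
#define TOY_GFP_MOVABLE  0x8u   /* bit 3 */

#define TOY_ZONES_SHIFT  2      /* 2 bits per table entry */

/* each 2-bit slot holds the zone for one flag combination, like GFP_ZONE_TABLE */
#define TOY_ZONE_TABLE ( \
        ((unsigned long)TOY_NORMAL  << (0                                   * TOY_ZONES_SHIFT)) | \
        ((unsigned long)TOY_DMA     << (TOY_GFP_DMA                         * TOY_ZONES_SHIFT)) | \
        ((unsigned long)TOY_HIGHMEM << (TOY_GFP_HIGHMEM                     * TOY_ZONES_SHIFT)) | \
        ((unsigned long)TOY_NORMAL  << (TOY_GFP_DMA32                       * TOY_ZONES_SHIFT)) | \
        ((unsigned long)TOY_NORMAL  << (TOY_GFP_MOVABLE                     * TOY_ZONES_SHIFT)) | \
        ((unsigned long)TOY_DMA     << ((TOY_GFP_MOVABLE | TOY_GFP_DMA)     * TOY_ZONES_SHIFT)) | \
        ((unsigned long)TOY_MOVABLE << ((TOY_GFP_MOVABLE | TOY_GFP_HIGHMEM) * TOY_ZONES_SHIFT)))

static enum toy_zone toy_gfp_zone(unsigned int flags)
{
        unsigned int bit = flags & 0xf;         /* the four zone modifier bits */

        return (enum toy_zone)((TOY_ZONE_TABLE >> (bit * TOY_ZONES_SHIFT)) &
                               ((1 << TOY_ZONES_SHIFT) - 1));
}

int main(void)
{
        printf("no modifier       -> zone %d (NORMAL)\n",  toy_gfp_zone(0));
        printf("DMA               -> zone %d (DMA)\n",     toy_gfp_zone(TOY_GFP_DMA));
        printf("HIGHMEM           -> zone %d (HIGHMEM)\n", toy_gfp_zone(TOY_GFP_HIGHMEM));
        printf("MOVABLE + HIGHMEM -> zone %d (MOVABLE)\n",
               toy_gfp_zone(TOY_GFP_MOVABLE | TOY_GFP_HIGHMEM));
        return 0;
}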

ii. Zone memory release

2019 void __free_pages(struct page *page, unsigned int order)
2020 {
2021         if (put_page_testzero(page)) {
2022                 trace_mm_page_free_direct(page, order);
2023                 if (order == 0)
2024                         free_hot_page(page);
2025                 else
2026                         __free_pages_ok(page, order);
2027         }
2028 }

The page frames are released either into the per-CPU cache (order 0) or into the buddy system.

 
