linux3.10 vmalloc的工作机制

本文详细介绍了Linux内核中vmalloc函数的工作原理,包括虚拟内存的申请、页表的建立以及物理内存的映射过程。从__vmalloc_node到__vmalloc_node_range,再到__get_vm_area_node和__vmalloc_area_node,最后通过vmap_page_range_noflush和vmap_pte_range完成虚拟到物理的映射。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

介绍一下vmalloc的整个分配过程:

vmalloc

     -------->__vmalloc_node_flags

         --------->__vmalloc_node

             ------------>__vmalloc_node_range

先看一下__vmalloc_node:

static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, const void *caller)
{
	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				gfp_mask, prot, node, caller);
}

可以看到vmalloc分配的虚拟内存的地址范围是VMALLOC_START ~~~VMALLOC_END

#define VMALLOC_OFFSET		(8*1024*1024)
#define VMALLOC_START		(((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
#define VMALLOC_END		0xff000000UL

vmalloc分配的地址位于高端内存之上,再偏移8M,接着分析__vmalloc_node_range:

void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, int node, const void *caller)
{
	struct vm_struct *area;
	void *addr;
	unsigned long real_size = size;

	size = PAGE_ALIGN(size);
	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
		goto fail;
        分配虚拟地址,并记录在vm_struct 中
	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
				  start, end, node, gfp_mask, caller);
	if (!area)
		goto fail;
        把该虚拟地址映射到一个物理页上
	addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
	if (!addr)
		return NULL;

	/*
	 * In this function, newly allocated vm_struct has VM_UNLIST flag.
	 * It means that vm_struct is not fully initialized.
	 * Now, it is fully initialized, so remove this flag here.
	 */
	clear_vm_unlist(area);

	/*
	 * A ref_count = 3 is needed because the vm_struct and vmap_area
	 * structures allocated in the __get_vm_area_node() function contain
	 * references to the virtual address of the vmalloc'ed block.
	 */
	kmemleak_alloc(addr, real_size, 3, gfp_mask);

	return addr;

fail:
	warn_alloc_failed(gfp_mask, 0,
			  "vmalloc: allocation failure: %lu bytes\n",
			  real_size);
	return NULL;
}

先看__get_vm_area_node:

static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	BUG_ON(in_interrupt());
	if (flags & VM_IOREMAP) {
		int bit = fls(size);

		if (bit > IOREMAP_MAX_ORDER)
			bit = IOREMAP_MAX_ORDER;
		else if (bit < PAGE_SHIFT)
			bit = PAGE_SHIFT;

		align = 1ul << bit;
	}

	size = PAGE_ALIGN(size);
	if (unlikely(!size))
		return NULL;
            先向slab系统申请一个vm_struct结构
	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	/*
	 * We always allocate a guard page.
	 */
	size += PAGE_SIZE;
        然后从空闲的虚拟地址中申请一块虚拟内存,用vmap_area管理
	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	/*
	 * When this function is called from __vmalloc_node_range,
	 * we add VM_UNLIST flag to avoid accessing uninitialized
	 * members of vm_struct such as pages and nr_pages fields.
	 * They will be set later.
	 */
	if (flags & VM_UNLIST)   把vmap_area和vm_struct关联起来,最终返回vm_struct
		setup_vmalloc_vm(area, va, flags, caller);
	else
		insert_vmalloc_vm(area, va, flags, caller);

	return area;
}

vm_struct通过和page结构相联系用来描述虚拟空间和物理地址的映射关系,而vmap_area用来管理内核的虚拟空间,内核利用红黑树算法,把vmap_area都挂载在全局链表vmap_area_root.rb_node;的红黑树上,以此来加快内存的搜索,所以kernel空间的虚存都是共享的

static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;
	struct vmap_area *first;

	BUG_ON(!size);
	BUG_ON(size & ~PAGE_MASK);
	BUG_ON(!is_power_of_2(align));

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

retry:
	spin_lock(&vmap_area_lock);
	/*
	 * Invalidate cache if we have more permissive parameters.
	 * cached_hole_size notes the largest hole noticed _below_
	 * the vmap_area cached in free_vmap_cache: if size fits
	 * into that hole, we want to scan from vstart to reuse
	 * the hole instead of allocating above free_vmap_cache.
	 * Note that __free_vmap_area may update free_vmap_cache
	 * without updating cached_hole_size or cached_align.
	 */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;
        搜索红黑树,在指定的start和end 范围中,找到一块大小为size的空闲内存块
	/* find starting point for our search */
	if (free_vmap_cache) {
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)
			goto nocache;
		if (addr + size - 1 < addr)
			goto overflow;

	} else {
		addr = ALIGN(vstart, align);
		if (addr + size - 1 < addr)
			goto overflow;

		n = vmap_area_root.rb_node;
		first = NULL;

		while (n) {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)
					break;
				n = n->rb_left;
			} else
				n = n->rb_right;
		}

		if (!first)
			goto found;
	}

	/* from the starting point, walk areas until a suitable hole is found */
	while (addr + size > first->va_start && addr + size <= vend) {
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		addr = ALIGN(first->va_end, align);
		if (addr + size - 1 < addr)
			goto overflow;

		if (list_is_last(&first->list, &vmap_area_list))
			goto found;

		first = list_entry(first->list.next,
				struct vmap_area, list);
	}

found:
	if (addr + size > vend)
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
        把该找到的符合条件的内内存块插入红黑树,标记为使用
	__insert_vmap_area(va);
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);

	BUG_ON(va->va_start & (align-1));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	return va;

overflow:
	spin_unlock(&vmap_area_lock);
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}
	if (printk_ratelimit())
		printk(KERN_WARNING
			"vmap allocation for size %lu failed: "
			"use vmalloc=<size> to increase size.\n", size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}

上面函数就是搜索红黑树,找到满足条件的空闲内存块,然后调用setup_vmalloc_vm,把vmap_area中的地址设置到vm_struct中,并把两者关联起来。再看__vmalloc_area_node,为虚拟地址和物理地址建立映射关系:

static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node, const void *caller)
{
	const int order = 0;
	struct page **pages;
	unsigned int nr_pages, array_size, i;
	gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;

	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {  大于一页,用vmalloc机制申请内存,变成递归调用了
		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
				PAGE_KERNEL, node, caller);
		area->flags |= VM_VPAGES;
	} else {
                小于一页,利用slab机制分配内存
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	area->caller = caller;
	if (!area->pages) {
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;
		gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;

		if (node < 0)   每次向伙伴系统申请一个页
			page = alloc_page(tmp_mask);
		else
			page = alloc_pages_node(node, tmp_mask, order);

		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
	}
        把上面申请的页建立页表
	if (map_vm_area(area, prot, &pages))
		goto fail;
	return area->addr;

fail:
	warn_alloc_failed(gfp_mask, order,
			  "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
			  (area->nr_pages*PAGE_SIZE), area->size);
	vfree(area->addr);
	return NULL;
}

上面函数向伙伴系统申请物理页面,每次只申请一个页面,所以可能会导致物理内存不连续,然后调用map_vm_area建立页表,把虚拟地址和物理地址映射起来:

map_vm_area

      -------->vmap_page_range

          ----------->vmap_page_range_noflush

static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);根据虚拟地址先算出页目录地址
	do {
		next = pgd_addr_end(addr, end);算出下一个页目录地址
		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);填写页目录和页表
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return nr;
}

 需要注意的是vmalloc分配的页表,都是填写在init进程的页目录里的,其他进程在内核态访问该块空间时,都是在缺页异常中完成的,在缺页异常中会把init进程的页表复制到进程的页表中

index = pgd_index(addr);
 
	pgd = cpu_get_pgd() + index; //从页表寄存器获取出错的页目录地址
	pgd_k = init_mm.pgd + index;//获取init进程的页目录地址
 
	if (pgd_none(*pgd_k))
		goto bad_area;
	if (!pgd_present(*pgd))
		set_pgd(pgd, *pgd_k);
 
	pud = pud_offset(pgd, addr);
	pud_k = pud_offset(pgd_k, addr);
 
	if (pud_none(*pud_k))
		goto bad_area;
	if (!pud_present(*pud))
		set_pud(pud, *pud_k);
 
	pmd = pmd_offset(pud, addr);
	pmd_k = pmd_offset(pud_k, addr);
 
#ifdef CONFIG_ARM_LPAE
	/*
	 * Only one hardware entry per PMD with LPAE.
	 */
	index = 0;
#else
	/*
	 * On ARM one Linux PGD entry contains two hardware entries (see page
	 * tables layout in pgtable.h). We normally guarantee that we always
	 * fill both L1 entries. But create_mapping() doesn't follow the rule.
	 * It can create inidividual L1 entries, so here we have to call
	 * pmd_none() check for the entry really corresponded to address, not
	 * for the first of pair.
	 */
	index = (addr >> SECTION_SHIFT) & 1;
#endif
	if (pmd_none(pmd_k[index]))
		goto bad_area;
 
	copy_pmd(pmd, pmd_k);

32位linux系统pmd和pud都是等于页目录的,所以直接看最后的调用函数:

vmap_pud_range

     -------->vmap_pmd_range

         --------->vmap_pte_range

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr); pmd是页目录地址,根据页目录地址得到其指向的页表项地址
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 填充页表,完成映射
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}

上面函数先调用pte_alloc_kernel为得到具体的页地址,这边分为两步,如果之前已经填充过页目录,那么页目录指向的页表项已经存在,只是页中的内容可以还没有映射到真正的物理地址,则直接拿到该页的地址等下一步进行填充,还有一种可能是页目录还没有被填充,则先分配一个内存页作为页表项,并把该地址填充到页目录中,接着根据页表项内偏移搜索该页表项中的具体页地址,并把该页地址返回,等待进一步填充完成映射。

#define pte_alloc_kernel(pmd, address)            \
    ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
        NULL: pte_offset_kernel(pmd, address))

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
	pte_t *new = pte_alloc_one_kernel(&init_mm, address);申请了4K空间,用来放页表
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		pmd_populate_kernel(&init_mm, pmd, new);填充页目录
		new = NULL;
	} else
		VM_BUG_ON(pmd_trans_splitting(*pmd));
	spin_unlock(&init_mm.page_table_lock);
	if (new)
		pte_free_kernel(&init_mm, new);
	return 0;
}

上面函数申请了一页,4K 空间用来放页表,其中2K空间用来linux系统自己维护,剩下的2K,总共有512个页,一个1M的页目录对应256个页,所以一下可以填充两个页目录,在pmd_populate_kernel中确实填充了两个页目录,具体页和页目录的映射关系如果忘了的话,再回顾下这篇文章:

https://blog.youkuaiyun.com/oqqYuJi12345678/article/details/96029177

pte_alloc_kernel返回页表的地址,然后利用mk_pte(page, prot)函数为页表把物理地址和标志位结合起来,最后调用set_pte_at把这个物理地址填写到页表中,至此,虚拟映射完成。vmalloc返回的虚拟地址已经可以使用了。

其他一些详解,可以参照这篇博文:

https://blog.youkuaiyun.com/chenying126/article/details/78511617

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值