Linux内核源代码情景分析-mmap后，文件与虚拟区间建立映射

最新推荐文章于 2022-08-04 16:16:44 发布

原创

最新推荐文章于 2022-08-04 16:16:44 发布 · 1.8k 阅读

1 ·

CC 4.0 BY-SA版权

文章详细介绍了Linux内核中mmap之后的页面映射过程，包括首次访问导致的缺页异常处理、页面换入、页面状态转换（如不活跃脏页面和不活跃干净页面）以及恢复映射的机制。通过深入解析内核源代码，阐述了文件如何从硬盘读入内存，以及页面状态在不同操作下的变化。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

一、文件映射的页面换入

在mmap后，mmap参考Linux内核源代码情景分析-系统调用mmap()，当这个区间的一个页面首次受到访问时，会由于见面无映射而发生缺页异常，相应的异常处理程序do_no_page()。

static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;
	if (!pte_present(entry)) {
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

由于pte_none(entry)为true，所以执行do_no_page，代码如下：

static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(mm, vma, page_table, write_access, address);

	/*
	 * The third argument is "no_share", which tells the low-level code
	 * to copy, not share the page even if sharing is possible.  It's
	 * essentially an early COW detection.
	 */
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);//指向了filemap_nopage
	if (new_page == NULL)	/* no page was available -- SIGBUS */
		return 0;
	if (new_page == NOPAGE_OOM)
		return -1;
	++mm->rss;
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	flush_page_to_ram(new_page);
	flush_icache_page(vma, new_page);
	entry = mk_pte(new_page, vma->vm_page_prot);
	if (write_access) {
		entry = pte_mkwrite(pte_mkdirty(entry));
	} else if (page_count(new_page) > 1 &&
		   !(vma->vm_flags & VM_SHARED))
		entry = pte_wrprotect(entry);
	set_pte(page_table, entry);//建立映射
	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	return 2;	/* Major fault */
}

filemap_nopage，分配一个空闲内存页面并从文件读入相应的页面。

struct page * filemap_nopage(struct vm_area_struct * area,
	unsigned long address, int no_share)
{
	int error;
	struct file *file = area->vm_file;
	struct inode *inode = file->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;//mapping来源于inode->i_mapping
	struct page *page, **hash, *old_page;
	unsigned long size, pgoff;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

retry_all:
	/*
	 * An external ptracer can access pages that normally aren't
	 * accessible..
	 */
	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if ((pgoff >= size) && (area->vm_mm == current->mm))
		return NULL;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(mapping, pgoff);
retry_find:
	page = __find_get_page(mapping,