In the kernel, memory compaction migrates the pages of user processes, and KSM merging merges pages. These operations run in parallel with userspace reads and writes of the same memory, which means the following can happen:
while the kernel is migrating or merging a page, a userspace process is reading or writing that very memory at the same time.
So how does the kernel guarantee concurrency safety in this situation?
For page migration, refer to this article.
This post focuses on the guarantees KSM provides for page concurrency safety while merging.
For a KSM merge to be concurrency safe, the following must hold:
- if userspace reads the page while the kernel is merging it, the page table and TLB must end up updated correctly
- if userspace writes the page while the kernel is merging it, the merge must terminate cleanly
Now let's RTFC and go straight to the point. The code is based on Linux kernel 4.19.195.
When the KSM scanner thread scans pages, it eventually calls replace_page() to perform the actual merge. The call chain is:
ksm_scan_thread->ksm_do_scan->cmp_and_merge_page->try_to_merge_one_page->replace_page
try_to_merge_one_page() attempts to merge the page and kpage passed to it, and replace_page() then carries out the replacement.
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
* @page: the PageAnon page that we want to replace with kpage
* @kpage: the PageKsm page that we want to map instead of page,
* or NULL the first time when we want to use page as kpage.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
static int try_to_merge_one_page(struct vm_area_struct *vma,
struct page *page, struct page *kpage)
{
***
/*
* We need the page lock to read a stable PageSwapCache in
* write_protect_page(). We use trylock_page() instead of
* lock_page() because we don't want to wait here - we
* prefer to continue scanning and merging different pages,
* then come back to this page when it is unlocked.
*/
if (!trylock_page(page))
goto out;
***
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
if (write_protect_page(vma, page, &orig_pte) == 0) {
if (!kpage) {
/*
* While we hold page lock, upgrade page from
* PageAnon+anon_vma to PageKsm+NULL stable_node:
* stable_tree_insert() will update stable_node.
*/
set_page_stable_node(page, NULL);
mark_page_accessed(page);
/*
* Page reclaim just frees a clean page with no dirty
* ptes: make sure that the ksm page would be swapped.
*/
if (!PageDirty(page))
SetPageDirty(page);
err = 0;
} else if (pages_identical(page, kpage)) // compare page and kpage contents once more
err = replace_page(vma, page, kpage, orig_pte); // if they are identical, call replace_page
}
***
out_unlock:
unlock_page(page);
out:
return err;
}
The write_protect_page() function changes the PTE that maps the given page to write-protected.
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
***
if (!page_vma_mapped_walk(&pvmw))
goto out_mn;
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
(pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
mm_tlb_flush_pending(mm)) {
pte_t entry;
swapped = PageSwapCache(page);
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
/*
* Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
* with the pagecount against the mapcount is racey and
* O_DIRECT can happen right after the check.
* So we clear the pte and flush the tlb before the check
* this assure us that no O_DIRECT can happen after the check
* or in the middle of the check.
*
* No need to notify as we are downgrading page table to read
* only not changing it to point to a new page.
*
* See Documentation/vm/mmu_notifier.rst
*/
entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
if (page_mapcount(page) + 1 + swapped != page_count(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
if (pte_dirty(entry))
set_page_dirty(page);
if (pte_protnone(entry))
entry = pte_mkclean(pte_clear_savedwrite(entry));
else
entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
}
*orig_pte = *pvmw.pte;
err = 0;
out_unlock:
page_vma_mapped_walk_done(&pvmw);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
The replace_page() function carries out the final page replacement.
/**
* replace_page - replace page in vma by new ksm page
* @vma: vma that holds the pte pointing to page
* @page: the page we are replacing by kpage
* @kpage: the ksm page we replace page by
* @orig_pte: the original value of the pte
*
* Returns 0 on success, -EFAULT on failure.
*/
static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
pte_t *ptep;
pte_t newpte;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
/*
* No need to check ksm_use_zero_pages here: we can only have a
* zero_page here if ksm_use_zero_pages was enabled already.
*/
if (!is_zero_pfn(page_to_pfn(kpage))) {
get_page(kpage);
page_add_anon_rmap(kpage, vma, addr, false);
newpte = mk_pte(kpage, vma->vm_page_prot);
} else {
newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
vma->vm_page_prot));
/*
* We're replacing an anonymous page with a zero page, which is
* not anonymous. We need to do proper accounting otherwise we
* will get wrong values in /proc, and a BUG message in dmesg
* when tearing down the mm.
*/
dec_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
/*
* No need to notify as we are replacing a read only page with another
* read only page with the same content.
*
* See Documentation/vm/mmu_notifier.rst
*/
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
err = 0;
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
The code is not long, so let's focus on how concurrency safety is achieved.
Start with write_protect_page(). It calls page_vma_mapped_walk() to lock the page table: the spinlock is obtained with pte_lockptr() and taken inside map_pte(). Once the walk and its checks succeed, control returns to write_protect_page(), which rewrites the PTE as write-protected, and the lock is finally released in page_vma_mapped_walk_done().
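To make that concrete, here is a condensed sketch of where the lock is taken, trimmed from my reading of map_pte() in mm/page_vma_mapped.c (the PVMW_* flag handling and early bail-outs are omitted), so treat it as an illustration rather than the verbatim 4.19 source:
/*
 * Sketch: page_vma_mapped_walk() ends up here. The PTE is mapped first,
 * then the page-table spinlock returned by pte_lockptr() is taken;
 * page_vma_mapped_walk_done() later does the matching unlock and unmap.
 */
static bool map_pte(struct page_vma_mapped_walk *pvmw)
{
	pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
	/* ... checks on *pvmw->pte omitted ... */
	pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
	spin_lock(pvmw->ptl);
	return true;
}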
The reason the lock is needed is this: once the PTE is write-protected, a userspace write to that page triggers a page fault, and the fault handler then updates the page table (making the PTE writable again). The lock makes modifications to the PTE atomic, so the KSM thread and the fault handler cannot both modify it at the same time and race with each other.
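The write-fault side takes exactly the same per-page-table spinlock. Heavily condensed from my reading of the 4.19 fault path in mm/memory.c (details may be slightly off), handle_pte_fault() does roughly the following for a write to a write-protected anonymous PTE:
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;
	/* ... pmd/pte setup and the not-present cases omitted ... */
	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
	spin_lock(vmf->ptl);			/* the same lock KSM takes */
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(*vmf->pte, entry)))
		goto unlock;			/* PTE changed under us */
	if (vmf->flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(vmf);	/* makes the PTE writable again */
		entry = pte_mkdirty(entry);
	}
	/* ... access/dirty bit updates omitted ... */
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}
So while write_protect_page() or replace_page() holds the lock, a concurrent write fault on the same page spins on it and can only proceed after KSM has dropped it.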
After write_protect_page() has done its work, control returns to try_to_merge_one_page(), which calls pages_identical() to compare the contents of page and kpage one more time; if they are still identical, it calls replace_page() to perform the final replacement.
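pages_identical() is a plain byte-by-byte comparison of the two pages. From memory, the 4.19 helpers in mm/ksm.c look roughly like this (a sketch, not a verbatim quote):
static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}
Because the page has just been write-protected, its contents can no longer change behind our back through the page tables: any later write has to go through the fault path and will be caught by the pte_same() check in replace_page().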
In replace_page(), the page table lock is taken with pte_offset_map_lock(), which gives the same protection against a concurrent page fault.
You may have noticed that between write_protect_page() and replace_page() there is a window where the lock is not held. A read in that window looks harmless enough, but what about a write? Because the PTE has been made read-only, the write triggers a page fault and the handler makes the PTE writable again. If the kernel then went ahead and merged the two pages anyway, wouldn't that be a problem?
Look closely at replace_page(): after taking the page table lock it deliberately checks if (!pte_same(*ptep, orig_pte)), where ptep points to the PTE and orig_pte is the original PTE value recorded in write_protect_page(). It is exactly this check that guarantees that, if the PTE was modified anywhere between write_protect_page() and replace_page(), replace_page() bails out with an error, keeping the merge safe under concurrency. As for why the check must be made after taking the lock, see this article. If everything goes well, replace_page() drops its reference to page with put_page() (returning it to the buddy system once the last reference is gone) before releasing the lock, and once the lock is released the merge is complete.
This example illustrates a general model Linux uses for concurrency protection:
if two functions, called one after the other, both take the same lock and both release it before returning, and you want the work done by the two functions to be atomic as a whole, or at least free of concurrency problems, then the first function can record the value of some variable inside its critical section; if that value turns out to be unchanged inside the second function's critical section, the combined sequence is concurrency safe (provided the logic allows it).
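Here is a minimal userspace sketch of that model in plain C with pthreads (nothing kernel-specific; all names are made up for illustration): step one samples the variable inside its critical section, step two re-takes the lock and only commits if the value is unchanged, backing off otherwise, just like replace_page()'s pte_same() check against orig_pte.
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_val;			/* stands in for the PTE */

/* Step one: do the preparatory work under the lock and remember what we saw. */
static int prepare(void)
{
	int seen;

	pthread_mutex_lock(&lock);
	seen = shared_val;		/* like saving orig_pte */
	/* ... preparatory work (e.g. write-protecting) would go here ... */
	pthread_mutex_unlock(&lock);
	return seen;
}

/* Step two: commit only if nothing changed during the unlocked window. */
static bool commit(int expected, int new_val)
{
	bool ok = false;

	pthread_mutex_lock(&lock);
	if (shared_val == expected) {	/* like pte_same(*ptep, orig_pte) */
		shared_val = new_val;
		ok = true;
	}
	pthread_mutex_unlock(&lock);
	return ok;			/* false means back off, like -EFAULT */
}

int main(void)
{
	int orig = prepare();

	/* Another thread writing shared_val between prepare() and commit()
	 * would make commit() fail, exactly like a write fault landing
	 * between write_protect_page() and replace_page(). */
	if (!commit(orig, orig + 1))
		printf("value changed in the window, aborting\n");
	return 0;
}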