x86 PAT原理

简介

PAT的全称为Page Attribute Table(页属性表),用于设置页的cache属性。intel文档《IA-32 Intel® Architecture Software Developer’s Manual Volume 3:System Programming Guide》page 403对PAT的定义如下:

The Page Attribute Table (PAT) extends the IA-32 architecture’s page-table format to allow
memory types to be assigned to regions of physical memory based on linear address mappings.

原理分析

内核代码版本为linux-5.4.191。

虚拟地址与物理地址映射时,页表项中有三个bit分别为:PAT、PCD、PWT,它们的值组成了一个索引index,然后通过index在PAT表中选择相应的entry,该entry的值代表cache模式:UC、WC、UC_MINUS、WT、WP、WB。

PAT表

x86有个寄存器叫MSR_IA32_CR_PAT,该寄存器的数据为64位共8个字节,PAT、PCD、PWT组成的index即选中该寄存器数据的第index字节(0、1、2....7),第index字节的值(只使用低3位,见下文__init_cache_modes中的“& 7”)即为cache模式(UC、WC、WT、WP、WB、UC_MINUS)。通过设置MSR_IA32_CR_PAT寄存器的值,即可设置相应index的cache模式。

当系统启动时有如下调用栈:

start_kernel

        ->setup_arch    linux-5.4.191\arch\x86\kernel\setup.c

                ->mtrr_bp_init

                        ->mtrr_bp_pat_init

                                ->pat_init

                                        ->pat_bsp_init

                                                ->wrmsrl(MSR_IA32_CR_PAT, pat);

                                                ->__init_cache_modes

                                                        ->update_cache_mode_entry

函数pat_init即设置PAT表,代码如下:

/*
 * pat_init - choose the PAT layout for this CPU and program it.
 *
 * Does nothing when pat_disabled is set.  Otherwise builds the 64-bit
 * PAT value (eight one-byte slots, see the PAT() macro) from one of two
 * layouts selected by boot_cpu_data's vendor/family/model, then passes it
 * to pat_bsp_init() on the first call (boot_cpu_done still false) or to
 * pat_ap_init() on every later call.
 */
void pat_init(void)
{
	u64 pat;
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (pat_disabled)
		return;

	/* Intel family 6 with model <= 0xd, or family 0xf with model <= 0x6. */
	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
		/*
		 * PAT support with the lower four entries. Intel Pentium 2,
		 * 3, M, and 4 are affected by PAT errata, which makes the
		 * upper four entries unusable. To be on the safe side, we don't
		 * use those.
		 *
		 *  PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 * PAT bit unused
		 *
		 * NOTE: When WT or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		/* Slots 4-7 repeat slots 0-3, so the PTE's PAT bit has no effect. */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
	} else {
		/*
		 * Full PAT support.  We put WT in slot 7 to improve
		 * robustness in the presence of errata that might cause
		 * the high PAT bit to be ignored.  This way, a buggy slot 7
		 * access will hit slot 3, and slot 3 is UC, so at worst
		 * we lose performance without causing a correctness issue.
		 * Pentium 4 erratum N46 is an example for such an erratum,
		 * although we try not to use PAT at all on affected CPUs.
		 *
		 *  PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      100    4    WB : Reserved
		 *      101    5    WP : _PAGE_CACHE_MODE_WP
		 *      110    6    UC-: Reserved
		 *      111    7    WT : _PAGE_CACHE_MODE_WT
		 *
		 * The reserved slots are unused, but mapped to their
		 * corresponding types in the presence of PAT errata.
		 */
		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
		      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
	}

	/* Only the first caller takes the pat_bsp_init() path; the flag latches. */
	if (!boot_cpu_done) {
		pat_bsp_init(pat);
		boot_cpu_done = true;
	} else {
		pat_ap_init(pat);
	}
}

/*
 * pat_bsp_init - write the chosen PAT value to the MSR and build the
 * software cache-mode tables from it.
 * @pat: 64-bit PAT value to program.
 *
 * Calls pat_disable() and returns without writing the MSR when the CPU
 * lacks X86_FEATURE_PAT or when the MSR currently reads back as zero.
 */
static void pat_bsp_init(u64 pat)
{
	u64 tmp_pat;

	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		pat_disable("PAT not supported by CPU.");
		return;
	}

	/* A current MSR value of zero is treated as "PAT unusable". */
	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
	if (!tmp_pat) {
		pat_disable("PAT MSR is 0, disabled.");
		return;
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
	pat_initialized = true;

	/* Keep the software lookup tables in step with what was just written. */
	__init_cache_modes(pat);
}

/*
 * __init_cache_modes - derive the software cache-mode tables from a PAT value.
 * @pat: 64-bit PAT value, one slot per byte.
 *
 * For each of the eight slots, the low three bits of the byte are decoded
 * by pat_get_cache_mode() and recorded through update_cache_mode_entry().
 * The resulting layout is logged and init_cm_done is set.
 */
static void __init_cache_modes(u64 pat)
{
	enum page_cache_mode cache;
	/* Eight 4-character labels plus the terminating NUL. */
	char pat_msg[33];
	int i;

	pat_msg[32] = 0;
	/*
	 * Walk from slot 7 down to slot 0: update_cache_mode_entry()
	 * overwrites __cachemode2pte_tbl[cache] on every call, so a mode
	 * that occupies several slots ends up mapped to its lowest slot.
	 */
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);

	init_cm_done = true;
}

/*
 * The cache modes defined here are used to translate between pure SW usage
 * and the HW defined cache mode bits and/or PAT entries.
 *
 * The resulting bits for PWT, PCD and PAT should be chosen in a way
 * to have the WB mode at index 0 (all bits clear). This is the default
 * right now and likely would break too much if changed.
 *
 * These are software identifiers; they are not the hardware PAT
 * encodings (compare the PAT_* enum, where e.g. PAT_WB is 6).
 */
#ifndef __ASSEMBLY__
enum page_cache_mode {
	_PAGE_CACHE_MODE_WB = 0,	/* must stay 0, see above */
	_PAGE_CACHE_MODE_WC = 1,
	_PAGE_CACHE_MODE_UC_MINUS = 2,
	_PAGE_CACHE_MODE_UC = 3,
	_PAGE_CACHE_MODE_WT = 4,
	_PAGE_CACHE_MODE_WP = 5,
	/* Values 6 and 7 are unassigned; NUM sizes __cachemode2pte_tbl[]. */
	_PAGE_CACHE_MODE_NUM = 8
};
#endif

/* CM(c) pastes its argument onto _PAGE_CACHE_MODE_, e.g. CM(WB) -> _PAGE_CACHE_MODE_WB. */
#define CM(c) (_PAGE_CACHE_MODE_ ## c)

/*
 * pat_get_cache_mode - translate one PAT slot value into a page_cache_mode.
 * @pat_val: memory-type encoding taken from a PAT slot (a PAT_* value).
 * @msg:     destination for a 4-character label; exactly four bytes are
 *           written and no NUL terminator is added.
 *
 * Returns the matching _PAGE_CACHE_MODE_* value.  Any encoding without a
 * PAT_* case falls through to the default and is reported as WB.
 */
static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	/* Labels are padded to four characters so the caller can pack them. */
	memcpy(msg, cache_mode, 4);

	return cache;
}

/*
 * update_cache_mode_entry - record which cache mode a PAT slot holds.
 * @entry: PAT slot index (the 3-bit PAT/PCD/PWT index).
 * @cache: cache mode assigned to that slot.
 *
 * Updates both directions of the translation: cache mode -> PTE bits in
 * __cachemode2pte_tbl[] and slot index -> cache mode in
 * __pte2cachemode_tbl[].  A later call with the same @cache replaces the
 * earlier __cachemode2pte_tbl[] mapping.
 */
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
	/* entry 0 MUST be WB (hardwired to speed up translations) */
	BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);

	__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
	__pte2cachemode_tbl[entry] = cache;
}

/*
 * __cm_idx2pte - spread a 3-bit slot index into PTE bit positions:
 * index bit 2 moves to _PAGE_BIT_PAT, bit 1 to _PAGE_BIT_PCD and
 * bit 0 to _PAGE_BIT_PWT.
 */
#define __cm_idx2pte(i)					\
	((((i) & 4) << (_PAGE_BIT_PAT - 2)) |		\
	 (((i) & 2) << (_PAGE_BIT_PCD - 1)) |		\
	 (((i) & 1) << _PAGE_BIT_PWT))

变量pat的值最终会被写入寄存器MSR_IA32_CR_PAT,其中宏PAT和相应的宏定义如下,比如PAT(1, WC)即设置变量pat的第1个字节为PAT_WC(值为1):

/*
 * Memory-type encodings stored in a PAT slot.  Values 2 and 3 have no
 * name here; pat_get_cache_mode() treats them as WB.
 */
enum {
    PAT_UC = 0,        /* uncached */
    PAT_WC = 1,        /* Write combining */
    PAT_WT = 4,        /* Write Through */
    PAT_WP = 5,        /* Write Protected */
    PAT_WB = 6,        /* Write Back (default) */
    PAT_UC_MINUS = 7,    /* UC, but can be overridden by MTRR */
};

linux-5.4.191\arch\x86\mm\pat.c

/* PAT(x, y): the PAT_<y> encoding shifted into byte x of a 64-bit PAT value. */
#define PAT(x, y)    ((u64)PAT_ ## y << ((x)*8)) 

随后调用函数pat_bsp_init->wrmsrl(MSR_IA32_CR_PAT, pat)将寄存器MSR_IA32_CR_PAT的值设置为pat。最后调用update_cache_mode_entry设置__cachemode2pte_tbl(cache mode到pte的PAT、PCD、PWT位转换)和__pte2cachemode_tbl(pte的PAT、PCD、PWT组成的index到cache mode的转换,见注释__pte2cachemode_tbl[] are the caching attribute bits of the pte   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2),内核设置cache mode或pte时就会使用这两个表项。它们的初始值如下:

linux-5.4.191\arch\x86\mm\init.c

/*
 * Cache mode -> PTE cache bits.  These initialisers are the values in
 * effect until update_cache_mode_entry() rewrites them: WC, WT and WP
 * all start out with the same bits as UC_MINUS (_PAGE_PCD only).
 */
uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
};
EXPORT_SYMBOL(__cachemode2pte_tbl);

/*
 * PTE cache bits -> cache mode, indexed through __pte2cm_idx() (defined
 * elsewhere; presumably the inverse of __cm_idx2pte() - confirm).  In
 * these initial values the four rows with _PAGE_PAT set repeat the four
 * rows without it, so the PAT bit does not change the result until
 * update_cache_mode_entry() rewrites the table.
 */
uint8_t __pte2cachemode_tbl[8] = {
	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
	[__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
EXPORT_SYMBOL(__pte2cachemode_tbl);

PAT的使用

下面代码设置vma的vm_page_prot为uncache:

vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

宏pgprot_noncached的代码如下,其中函数cachemode2protval通过之前设置的__cachemode2pte_tbl将_PAGE_CACHE_MODE_UC_MINUS转换为了pte的属性,然后与prot按位或操作,最后将结果赋值给vma->vm_page_prot(为什么不先清除pte的cache属性然后再进行或操作?arm64和loongarch64的pgprot_noncached实现都是先清除cache mask然后再进行或操作。为了组合效果?例如_PAGE_CACHE_MODE_WC的index为001,_PAGE_CACHE_MODE_UC_MINUS的index为010,两个进行或操作后index就变为011即_PAGE_CACHE_MODE_UC):

/*
 * pgprot_noncached - OR the PTE bits for _PAGE_CACHE_MODE_UC_MINUS into
 * @prot.  Existing cache bits in @prot are not cleared first.  When
 * boot_cpu_data.x86 is 3 or lower, @prot is returned unchanged.
 */
#define pgprot_noncached(prot)						\
	((boot_cpu_data.x86 > 3)					\
	 ? (__pgprot(pgprot_val(prot) |					\
		     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))	\
	 : (prot))

/*
 * cachemode2protval - return the PTE cache bits for a cache mode.
 * @pcm: cache mode to translate.
 *
 * Mode 0 (_PAGE_CACHE_MODE_WB) is answered directly with 0, skipping
 * the table read; every other mode is looked up in __cachemode2pte_tbl[].
 */
static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
{
	if (likely(pcm == 0))
		return 0;
	return __cachemode2pte_tbl[pcm];
}

参考文档

《IA-32 Intel® Architecture Software Developer’s Manual Volume 3:System Programming Guide》

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值