简介
PAT的全称为Page Attribute Table(页属性表),用于设置页的cache属性。Intel文档《IA-32 Intel® Architecture Software Developer’s Manual Volume 3:System Programming Guide》第403页对PAT的定义如下:
The Page Attribute Table (PAT) extends the IA-32 architecture’s page-table format to allow
memory types to be assigned to regions of physical memory based on linear address mappings.
原理分析
内核代码版本为linux-5.4.191。
虚拟地址与物理地址映射时,页表项中有三个bit,分别为PAT、PCD、PWT,它们的值组成一个3位的索引index(PAT为最高位,PWT为最低位),然后通过index在PAT表中选择相应的entry,该entry的值代表cache模式:UC、WC、UC_MINUS、WT、WP、WB。
PAT表
x86有一个MSR寄存器,内核中对应的宏为MSR_IA32_CR_PAT。该寄存器的数据为64位共8个字节,PAT、PCD、PWT组成的index即选中该寄存器数据的第index个字节(0、1、2……7),该字节低3位的值即为cache模式(UC、WC、WT、WP、WB、UC_MINUS)的编码。通过设置MSR_IA32_CR_PAT寄存器的值,即可设置相应index的cache模式。
当系统启动时有如下调用栈:
start_kernel
  ->setup_arch linux-5.4.191\arch\x86\kernel\setup.c
    ->mtrr_bp_init
      ->mtrr_bp_pat_init
        ->pat_init
          ->pat_bsp_init
            ->wrmsrl(MSR_IA32_CR_PAT, pat);
            ->__init_cache_modes
              ->update_cache_mode_entry
函数pat_init即设置PAT表,代码如下:
/*
 * pat_init - pick the PAT layout for this CPU and hand it on for programming.
 *
 * Returns immediately when pat_disabled is set. Otherwise builds the 64-bit
 * PAT value (one entry per byte) and passes it to pat_bsp_init() the first
 * time through (boot_cpu_done still false), or to pat_ap_init() on every
 * later call.
 */
void pat_init(void)
{
	struct cpuinfo_x86 *cpu = &boot_cpu_data;
	u64 pat;

	if (pat_disabled)
		return;

	/*
	 * Default: full PAT support. WT sits in slot 7 to improve robustness
	 * in the presence of errata that might cause the high PAT bit to be
	 * ignored: a buggy slot 7 access then hits slot 3, and slot 3 is UC,
	 * so at worst performance is lost without a correctness issue.
	 * Pentium 4 erratum N46 is an example of such an erratum, although
	 * PAT is not meant to be used at all on affected CPUs.
	 *
	 *  PTE encoding:
	 *      PAT
	 *      |PCD
	 *      ||PWT  PAT
	 *      |||    slot
	 *      000    0    WB : _PAGE_CACHE_MODE_WB
	 *      001    1    WC : _PAGE_CACHE_MODE_WC
	 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
	 *      011    3    UC : _PAGE_CACHE_MODE_UC
	 *      100    4    WB : Reserved
	 *      101    5    WP : _PAGE_CACHE_MODE_WP
	 *      110    6    UC-: Reserved
	 *      111    7    WT : _PAGE_CACHE_MODE_WT
	 *
	 * The reserved slots are unused, but mapped to their corresponding
	 * types in the presence of PAT errata.
	 */
	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
	      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);

	if (cpu->x86_vendor == X86_VENDOR_INTEL) {
		int fam6_affected = (cpu->x86 == 0x6) && (cpu->x86_model <= 0xd);
		int famf_affected = (cpu->x86 == 0xf) && (cpu->x86_model <= 0x6);

		if (fam6_affected || famf_affected) {
			/*
			 * PAT support with the lower four entries only.
			 * Intel Pentium 2, 3, M, and 4 are affected by PAT
			 * errata which make the upper four entries unusable,
			 * so the upper half simply mirrors the lower half.
			 *
			 *  PTE encoding:
			 *      PAT
			 *      |PCD
			 *      ||PWT  PAT
			 *      |||    slot
			 *      000    0    WB : _PAGE_CACHE_MODE_WB
			 *      001    1    WC : _PAGE_CACHE_MODE_WC
			 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
			 *      011    3    UC : _PAGE_CACHE_MODE_UC
			 *  PAT bit unused
			 *
			 * NOTE: when WT or WP is used, it is redirected to
			 * UC- per the default setup in __cachemode2pte_tbl[].
			 */
			pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
			      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
		}
	}

	/* Every call after the first takes the pat_ap_init() path. */
	if (boot_cpu_done) {
		pat_ap_init(pat);
		return;
	}

	pat_bsp_init(pat);
	boot_cpu_done = true;
}
/*
 * pat_bsp_init - program the PAT MSR and build the cache-mode tables.
 * @pat: value to write to MSR_IA32_CR_PAT (one PAT entry per byte).
 *
 * Gives up through pat_disable() when the CPU does not report
 * X86_FEATURE_PAT, or when the MSR currently reads as zero. Otherwise
 * writes @pat to the MSR, sets pat_initialized and lets
 * __init_cache_modes() derive the translation tables from @pat.
 */
static void pat_bsp_init(u64 pat)
{
	u64 current_msr;

	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		pat_disable("PAT not supported by CPU.");
		return;
	}

	/* A PAT MSR that reads as all zeroes is treated as unusable. */
	rdmsrl(MSR_IA32_CR_PAT, current_msr);
	if (current_msr == 0) {
		pat_disable("PAT MSR is 0, disabled.");
		return;
	}

	wrmsrl(MSR_IA32_CR_PAT, pat);
	pat_initialized = true;

	__init_cache_modes(pat);
}
/*
 * __init_cache_modes - rebuild the cache-mode tables from a PAT value.
 * @pat: PAT value, one entry per byte; only the low three bits of each
 *       byte are decoded.
 *
 * Decodes all eight PAT slots, records each one through
 * update_cache_mode_entry(), logs the resulting configuration and sets
 * init_cm_done.
 */
static void __init_cache_modes(u64 pat)
{
	char pat_msg[33];
	int slot = 8;

	/* Eight 4-character labels are packed into bytes 0..31. */
	pat_msg[32] = 0;

	/*
	 * Walk from slot 7 down to slot 0: when the same cache mode occupies
	 * several slots, the lowest slot is written last and is therefore the
	 * one left in __cachemode2pte_tbl[].
	 */
	while (slot-- > 0) {
		unsigned int encoding = (pat >> (slot * 8)) & 7;
		enum page_cache_mode mode;

		mode = pat_get_cache_mode(encoding, pat_msg + 4 * slot);
		update_cache_mode_entry(slot, mode);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);

	init_cm_done = true;
}
/*
 * The cache modes defined here are used to translate between pure SW usage
 * and the HW defined cache mode bits and/or PAT entries.
 *
 * The resulting bits for PWT, PCD and PAT should be chosen in a way
 * to have the WB mode at index 0 (all bits clear). This is the default
 * right now and likely would break too much if changed.
 *
 * These are software identifiers used to index __cachemode2pte_tbl[]; they
 * are distinct from the hardware PAT_* encodings stored in the PAT MSR.
 */
#ifndef __ASSEMBLY__
enum page_cache_mode {
/* Write back; must remain 0 (see above). */
_PAGE_CACHE_MODE_WB = 0,
/* Write combining. */
_PAGE_CACHE_MODE_WC = 1,
/* UC-: uncached, but can be overridden by MTRR. */
_PAGE_CACHE_MODE_UC_MINUS = 2,
/* Uncached. */
_PAGE_CACHE_MODE_UC = 3,
/* Write through. */
_PAGE_CACHE_MODE_WT = 4,
/* Write protected. */
_PAGE_CACHE_MODE_WP = 5,
/* Number of table slots reserved in __cachemode2pte_tbl[]; not a mode. */
_PAGE_CACHE_MODE_NUM = 8
};
#endif
/* Shorthand for the _PAGE_CACHE_MODE_* enumerator with suffix @c. */
#define CM(c) (_PAGE_CACHE_MODE_ ## c)

/*
 * pat_get_cache_mode - decode one hardware PAT entry.
 * @pat_val: hardware PAT encoding (one of the PAT_* values).
 * @msg:     destination for exactly four characters naming the mode. This
 *           function does not NUL-terminate it; the caller owns that.
 *
 * Returns the software cache mode matching @pat_val. Encodings that are
 * not one of the known PAT_* values are reported as WB.
 *
 * Every label below must be exactly four visible characters wide.
 * __init_cache_modes() packs eight of them back to back into one string,
 * so a shorter label would make the 4-byte memcpy() copy the literal's
 * terminating NUL into the middle of that string and truncate the logged
 * configuration after the first entry.
 */
static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
{
	enum page_cache_mode cache;
	const char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	/* Copy the four label characters only, never the terminator. */
	memcpy(msg, cache_mode, 4);

	return cache;
}
/*
 * update_cache_mode_entry - record which cache mode a PAT slot selects.
 * @entry: PAT slot index.
 * @cache: cache mode found in that slot.
 *
 * Updates both translation directions: slot -> mode in
 * __pte2cachemode_tbl[] and mode -> PTE bits in __cachemode2pte_tbl[].
 */
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
	/* Slot 0 is required to be WB (hardwired to speed up translations). */
	BUG_ON(entry == 0 && cache != _PAGE_CACHE_MODE_WB);

	__pte2cachemode_tbl[entry] = cache;
	__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
}
/*
 * __cm_idx2pte - spread a 3-bit PAT slot index over the PTE cache bits.
 *
 * Bit 0 of @i lands on _PAGE_BIT_PWT, bit 1 on _PAGE_BIT_PCD and bit 2 on
 * _PAGE_BIT_PAT. @i is evaluated more than once.
 */
#define __cm_idx2pte(i)					\
	((((i) & 1) << _PAGE_BIT_PWT) |			\
	 (((i) & 2) << (_PAGE_BIT_PCD - 1)) |		\
	 (((i) & 4) << (_PAGE_BIT_PAT - 2)))
变量pat的值最终会被写入寄存器MSR_IA32_CR_PAT,其中宏PAT和相应的枚举定义如下。比如PAT(1, WC)即设置变量pat的第1个字节(从0开始计数)为PAT_WC(值为1):
/*
 * Hardware memory-type encodings, as stored in each entry of the PAT value
 * that is written to MSR_IA32_CR_PAT.
 *
 * Source: linux-5.4.191\arch\x86\mm\pat.c
 */
enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};
/* PAT(x, y) - hardware encoding PAT_<y> positioned in byte @x of a 64-bit PAT value. */
#define PAT(x, y)	(((u64)PAT_ ## y) << (8 * (x)))
随后调用函数pat_bsp_init->wrmsrl(MSR_IA32_CR_PAT, pat)将寄存器MSR_IA32_CR_PAT的值设置为pat。最后调用update_cache_mode_entry设置两张转换表:__cachemode2pte_tbl(cache mode到pte的PAT、PCD、PWT位的转换)和__pte2cachemode_tbl(pte的PAT、PCD、PWT位组成的index到cache mode的转换,见内核注释:“Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2”)。内核设置cache mode或pte时就会使用这两张表。它们的初始值如下:
linux-5.4.191\arch\x86\mm\init.c
/*
 * Cache mode -> PTE cache bits, indexed by enum page_cache_mode.
 *
 * These are only the initial values: update_cache_mode_entry() overwrites
 * entries once a PAT value has been decoded. In this initial setup WC, WT
 * and WP all share the encoding of UC- (_PAGE_PCD alone), and UC is
 * _PAGE_PWT | _PAGE_PCD.
 */
uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
[_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
[_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
};
EXPORT_SYMBOL(__cachemode2pte_tbl);
/*
 * PTE cache bits -> cache mode.
 *
 * The index is produced by __pte2cm_idx() from the _PAGE_PWT, _PAGE_PCD and
 * _PAGE_PAT bits of a pte. These are only the initial values:
 * update_cache_mode_entry() overwrites entries once a PAT value has been
 * decoded. In this initial setup the _PAGE_PAT bit makes no difference
 * (the last four entries repeat the first four), and PWT alone or PCD
 * alone both mean UC-.
 */
uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
[__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC,
[__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
[__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
EXPORT_SYMBOL(__pte2cachemode_tbl);
PAT的使用
下面代码设置vma的vm_page_prot为uncache:
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
宏pgprot_noncached的代码如下,其中函数cachemode2protval通过之前设置的__cachemode2pte_tbl将_PAGE_CACHE_MODE_UC_MINUS转换为pte的cache属性位,然后与prot按位或,最后将结果赋值给vma->vm_page_prot(疑问:为什么不先清除pte的cache属性位再进行或操作?arm64和loongarch64的pgprot_noncached实现都是先清除cache mask再进行或操作。是为了组合效果吗?例如_PAGE_CACHE_MODE_WC的index为001,_PAGE_CACHE_MODE_UC_MINUS的index为010,两者按位或后index变为011,即_PAGE_CACHE_MODE_UC):
/*
 * pgprot_noncached - @prot with the UC- cache encoding OR-ed in.
 *
 * When boot_cpu_data.x86 is 3 or lower, @prot is returned unchanged.
 * Otherwise the bits for _PAGE_CACHE_MODE_UC_MINUS are OR-ed into @prot;
 * any cache bits already set in @prot are kept, not cleared first.
 */
#define pgprot_noncached(prot)						\
	((boot_cpu_data.x86 <= 3)					\
	 ? (prot)							\
	 : (__pgprot(pgprot_val(prot) |					\
		     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))))
/*
 * cachemode2protval - PTE cache bits for a software cache mode.
 * @pcm: cache mode to translate.
 *
 * Mode 0 is answered with 0 directly, without a table access; every other
 * mode is looked up in __cachemode2pte_tbl[].
 */
static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
{
	return likely(pcm == 0) ? 0 : __cachemode2pte_tbl[pcm];
}
参考文档
《IA-32 Intel® Architecture Software Developer’s Manual Volume 3:System Programming Guide》