kernel: mce: [Hardware Error]: Machine check events logged (Part 2): Memory Errors
1. The kernel call stack for memory error handling
When the dmesg message kernel: mce: [Hardware Error]: Machine check events logged records a memory error, the functions that handle it are defined in:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc.c
The call chain is:
edac_mc_handle_error -> edac_raw_mc_handle_error -> edac_ce_error (or edac_ue_error for uncorrected errors)
- edac_mc_handle_error
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
                          struct mem_ctl_info *mci,
                          const u16 error_count,
                          const unsigned long page_frame_number,
                          const unsigned long offset_in_page,
                          const unsigned long syndrome,
                          const int top_layer,
                          const int mid_layer,
                          const int low_layer,
                          const char *msg,
                          const char *other_detail)
{
        struct dimm_info *dimm;
        char *p, *end;
        int row = -1, chan = -1;
        int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
        int i, n_labels = 0;
        struct edac_raw_error_desc *e = &mci->error_desc;
        bool any_memory = true;
        const char *prefix;

        edac_dbg(3, "MC%d\n", mci->mc_idx);

        /* Fills the error report buffer */
        memset(e, 0, sizeof (*e));
        e->error_count = error_count;
        e->type = type;
        e->top_layer = top_layer;
        e->mid_layer = mid_layer;
        e->low_layer = low_layer;
        e->page_frame_number = page_frame_number;
        e->offset_in_page = offset_in_page;
        e->syndrome = syndrome;
        /* need valid strings here for both: */
        e->msg = msg ?: "";
        e->other_detail = other_detail ?: "";

        /*
         * Check if the event report is consistent and if the memory location is
         * known. If it is, the DIMM(s) label info will be filled and the DIMM's
         * error counters will be incremented.
         */
        for (i = 0; i < mci->n_layers; i++) {
                if (pos[i] >= (int)mci->layers[i].size) {
                        edac_mc_printk(mci, KERN_ERR,
                                       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
                                       edac_layer_name[mci->layers[i].type],
                                       pos[i], mci->layers[i].size);
                        /*
                         * Instead of just returning it, let's use what's
                         * known about the error. The increment routines and
                         * the DIMM filter logic will do the right thing by
                         * pointing the likely damaged DIMMs.
                         */
                        pos[i] = -1;
                }
                if (pos[i] >= 0)
                        any_memory = false;
        }

        /*
         * Get the dimm label/grain that applies to the match criteria.
         * As the error algorithm may not be able to point to just one memory
         * stick, the logic here will get all possible labels that could
         * potentially be affected by the error.
         * On FB-DIMM memory controllers, for uncorrected errors, it is common
         * to have only the MC channel and the MC dimm (also called "branch")
         * but the channel is not known, as the memory is arranged in pairs,
         * where each memory belongs to a separate channel within the same
         * branch.
         */
        p = e->label;
        *p = '\0';
        end = p + sizeof(e->label);
        prefix = "";

        mci_for_each_dimm(mci, dimm) {
                if (top_layer >= 0 && top_layer != dimm->location[0])
                        continue;
                if (mid_layer >= 0 && mid_layer != dimm->location[1])
                        continue;
                if (low_layer >= 0 && low_layer != dimm->location[2])
                        continue;

                /* get the max grain, over the error match range */
                if (dimm->grain > e->grain)
                        e->grain = dimm->grain;

                /*
                 * If the error is memory-controller wide, there's no need to
                 * seek for the affected DIMMs because the whole channel/memory
                 * controller/... may be affected. Also, don't show errors for
                 * empty DIMM slots.
                 */
                if (!dimm->nr_pages)
                        continue;

                n_labels++;
                if (n_labels > EDAC_MAX_LABELS) {
                        p = e->label;
                        *p = '\0';
                } else {
                        p += scnprintf(p, end - p, "%s%s", prefix, dimm->label);
                        prefix = OTHER_LABEL;
                }

                /*
                 * get csrow/channel of the DIMM, in order to allow
                 * incrementing the compat API counters
                 */
                edac_dbg(4, "%s csrows map: (%d,%d)\n",
                         mci->csbased ? "rank" : "dimm",
                         dimm->csrow, dimm->cschannel);
                if (row == -1)
                        row = dimm->csrow;
                else if (row >= 0 && row != dimm->csrow)
                        row = -2;

                if (chan == -1)
                        chan = dimm->cschannel;
                else if (chan >= 0 && chan != dimm->cschannel)
                        chan = -2;
        }

        if (any_memory)
                strscpy(e->label, "any memory", sizeof(e->label));
        else if (!*e->label)
                strscpy(e->label, "unknown memory", sizeof(e->label));

        edac_inc_csrow(e, row, chan);

        /* Fill the RAM location data */
        p = e->location;
        end = p + sizeof(e->location);
        prefix = "";

        for (i = 0; i < mci->n_layers; i++) {
                if (pos[i] < 0)
                        continue;
                p += scnprintf(p, end - p, "%s%s:%d", prefix,
                               edac_layer_name[mci->layers[i].type], pos[i]);
                prefix = " ";
        }

        edac_raw_mc_handle_error(e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
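For context on how this entry point is reached: a platform EDAC driver (sb_edac, i10nm_edac, and so on) decodes the machine-check record into layer coordinates and then calls edac_mc_handle_error(). The fragment below is only an illustrative sketch of such a call site; the function name and all decoded values are hypothetical placeholders, not code from any real driver.

static void report_decoded_error(struct mem_ctl_info *mci)
{
        /* Placeholder values standing in for what a driver would
         * decode from the machine-check registers. */
        edac_mc_handle_error(HW_EVENT_ERR_CORRECTED,    /* type: a CE */
                             mci,
                             1,                 /* error_count */
                             0x1354969,         /* page_frame_number */
                             0x540,             /* offset_in_page */
                             0,                 /* syndrome */
                             2,                 /* top_layer, e.g. channel */
                             0,                 /* mid_layer, e.g. slot */
                             -1,                /* low_layer: unknown */
                             "memory read error",
                             "err_code:0101:0090");
}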
- edac_raw_mc_handle_error
void edac_raw_mc_handle_error(struct edac_raw_error_desc *e)
{
        struct mem_ctl_info *mci = error_desc_to_mci(e);
        u8 grain_bits;

        /* Sanity-check driver-supplied grain value. */
        if (WARN_ON_ONCE(!e->grain))
                e->grain = 1;

        grain_bits = fls_long(e->grain - 1);

        /* Report the error via the trace interface */
        if (IS_ENABLED(CONFIG_RAS))
                trace_mc_event(e->type, e->msg, e->label, e->error_count,
                               mci->mc_idx, e->top_layer, e->mid_layer,
                               e->low_layer,
                               (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
                               grain_bits, e->syndrome, e->other_detail);

        if (e->type == HW_EVENT_ERR_CORRECTED)
                edac_ce_error(e);
        else
                edac_ue_error(e);
}
EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
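Note how grain_bits is derived: fls_long(e->grain - 1) is effectively ceil(log2(grain)), so the grain passed to the trace interface is expressed as a power-of-two exponent. For the grain:32 value seen in the logs below this yields 5 (2^5 = 32). A userspace re-implementation of the same arithmetic, for illustration only:

#include <stdio.h>

/* 1-based find-last-set, mirroring the kernel's fls_long() helper */
static unsigned int fls_long(unsigned long x)
{
        unsigned int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned long grain = 32;       /* matches "grain:32" in the EDAC logs */

        printf("grain_bits = %u\n", fls_long(grain - 1));       /* prints 5 */
        return 0;
}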
- edac_ce_error
static void edac_ce_error(struct edac_raw_error_desc *e)
{
        struct mem_ctl_info *mci = error_desc_to_mci(e);
        unsigned long remapped_page;

        if (edac_mc_get_log_ce()) {
                edac_mc_printk(mci, KERN_WARNING,
                        "%d CE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx%s%s)\n",
                        e->error_count, e->msg,
                        *e->msg ? " " : "",
                        e->label, e->location, e->page_frame_number, e->offset_in_page,
                        e->grain, e->syndrome,
                        *e->other_detail ? " - " : "",
                        e->other_detail);
        }

        edac_inc_ce_error(e);

        if (mci->scrub_mode == SCRUB_SW_SRC) {
                /*
                 * Some memory controllers (called MCs below) can remap
                 * memory so that it is still available at a different
                 * address when PCI devices map into memory.
                 * MC's that can't do this, lose the memory where PCI
                 * devices are mapped. This mapping is MC-dependent
                 * and so we call back into the MC driver for it to
                 * map the MC page to a physical (CPU) page which can
                 * then be mapped to a virtual page - which can then
                 * be scrubbed.
                 */
                remapped_page = mci->ctl_page_to_phys ?
                        mci->ctl_page_to_phys(mci, e->page_frame_number) :
                        e->page_frame_number;

                edac_mc_scrub_block(remapped_page, e->offset_in_page, e->grain);
        }
}
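Besides printing to the kernel log, edac_ce_error() calls edac_inc_ce_error(), which feeds the per-controller and per-DIMM counters exported through sysfs. A minimal userspace reader for those counters, assuming the standard layout /sys/devices/system/edac/mc/mc<N>/ce_count:

#include <stdio.h>
#include <glob.h>

int main(void)
{
        glob_t g;
        size_t i;

        /* one ce_count file per memory controller */
        if (glob("/sys/devices/system/edac/mc/mc*/ce_count", 0, NULL, &g))
                return 1;

        for (i = 0; i < g.gl_pathc; i++) {
                FILE *f = fopen(g.gl_pathv[i], "r");
                unsigned long ce = 0;

                if (!f)
                        continue;
                if (fscanf(f, "%lu", &ce) == 1)
                        printf("%s: %lu\n", g.gl_pathv[i], ce);
                fclose(f);
        }
        globfree(&g);
        return 0;
}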
2. Example: an uncorrectable memory read error
(1) Symptom
https://access.redhat.com/solutions/6997755
Feb 8 08:45:20 abcxyz kernel: EDAC MC1: 0 CE memory read error on CPU_SrcID#0_MC#1_Chan#2_DIMM#0 (channel:2 slot:0 page:0x2870e08 offset:0xb40 grain:32 syndrome:0x0 - err_code:0x0000:0x009f socket:0 imc:1 rank:0 bg:1 ba:2 row:0x1a970 col:0x170)
Feb 8 08:45:20 abcxyz kernel: soft offline: 0x2870e08: migration failed 1, type 2fffff00008000
Feb 8 08:45:21 abcxyz kernel: MCE: Killing SomeAppThread:159778 due to hardware memory corruption fault at 2aab464098c8
These log lines, recorded at 08:45:20 on February 8, 2023, document a memory read error detected through the EDAC (Error Detection and Correction) mechanism. The first line reports 0 CE memory read error: the correctable-error (Correctable Error, CE) count is zero, meaning the hardware detected the corruption but could not correct it.
(2) Log analysis
First log line: EDAC MC1: 0 CE memory read error
- EDAC MC1: the error is reported by memory controller 1 (Memory Controller 1) via EDAC.
- 0 CE: the correctable-error (Correctable Error) count is 0, i.e. the hardware detected corrupted data in memory but could not correct it.
- CPU_SrcID#0_MC#1_Chan#2_DIMM#0: the error location:
  - CPU_SrcID#0: CPU source ID 0 (the first CPU).
  - MC#1: memory controller 1.
  - Chan#2: memory channel 2.
  - DIMM#0: DIMM slot 0.
- channel:2 slot:0: confirms the failing DIMM sits on channel 2, slot 0.
- page:0x2870e08 offset:0xb40: the physical page frame number and the offset within that page. Combined as (0x2870e08 << 12) | 0xb40 (assuming 4 KiB pages), they give the physical address 0x2870e08b40; section 3 below walks through the same arithmetic.
- grain:32: the granularity of the error report (32 bytes).
- syndrome:0x0: the syndrome is a key piece of diagnostic data, normally supplied by ECC (Error-Correcting Code) memory; it helps identify the type and position of the faulty bits.
- err_code:0x0000:0x009f: a hardware-specific error code.
- socket:0 imc:1 rank:0 bg:1 ba:2 row:0x1a970 col:0x170: the location within the physical memory topology:
  - socket:0: CPU socket 0.
  - imc:1: integrated memory controller 1.
  - rank:0: memory rank 0.
  - bg:1 ba:2: bank group and bank address.
  - row:0x1a970 col:0x170: row and column address.
Second log line: soft offline: 0x2870e08: migration failed
- soft offline: the kernel tried to "soft offline" the failing page, i.e. take it out of service.
- 0x2870e08: the page frame number of the failing physical page.
- migration failed: the kernel could not migrate the page's contents elsewhere first.
- type 2fffff00008000: the page's type/state flags at the time of the attempt.
Third log line: MCE: Killing SomeAppThread:159778 due to hardware memory corruption fault
- MCE: Machine Check Exception, i.e. the hardware detected a serious error.
- Killing SomeAppThread:159778: the kernel killed the process SomeAppThread (PID 159778) because it accessed the corrupted memory.
- hardware memory corruption fault: the fault was caused by hardware memory corruption.
- at 2aab464098c8: the virtual address at which the fault occurred.
3. Example: a correctable memory read error
(1) Symptom
https://www.supermicro.org.cn/support/faqs/faq.cfm?faq=35300
kernel: EDAC MC1: 1 CE memory read error on CPU_SrcID#0_MC#1_Chan#0_DIMM#0 (channel:0 slot:0 page:0x1354969 offset:0x540 grain:32 syndrome:0x0 - err_code:0101:0090 socket:0 imc:1 rank:1 bg:3 ba:0 row:12a9b col:50)
This log line shows that the system detected a correctable memory read error (Correctable Error, CE). Field by field:
(2) Log breakdown
- EDAC MC1: reported by memory controller 1 (Memory Controller, MC).
- 1 CE memory read error: one correctable error was detected during a memory read; the system corrected it automatically, with no manual intervention required.
- CPU_SrcID#0_MC#1_Chan#0_DIMM#0: identifies the failing hardware component: CPU source ID 0, memory controller 1, channel 0, DIMM 0.
- (channel:0 slot:0 page:0x1354969 offset:0x540 grain:32 syndrome:0x0 - err_code:0101:0090 socket:0 imc:1 rank:1 bg:3 ba:0 row:12a9b col:50):
  - channel:0: the error occurred on channel 0.
  - slot:0: slot number 0.
  - page:0x1354969: the physical page frame number.
  - offset:0x540: the offset within the page.
  - grain:32: the smallest unit the report can resolve, 32 bytes.
  - syndrome:0x0: the ECC syndrome value used for error detection.
  - err_code:0101:0090: the specific error code.
  - socket:0: CPU socket number.
  - imc:1: integrated memory controller number.
  - rank:1: rank number within the DIMM.
  - bg:3: bank group number.
  - ba:0: bank address number.
  - row:12a9b: row address.
  - col:50: column address.
(3) Was any process affected?
In the uncorrectable-error example above, the log explicitly names the process that was affected.
For a correctable error, no process should be affected in principle. But what if some process on the system really is in an abnormal state? We can check whether the failing page was in use:
- Compute the physical address
  - page:0x1354969: the physical page frame number.
  - offset:0x540: the offset from the start of the page.
  - grain:32: the smallest affected unit, 32 bytes.
Assuming a 4 KiB page size:
physical page base: 0x1354969000 (page frame number * page size)
physical address: 0x1354969540 (page base + 0x540)
- What is the corresponding virtual address?
A physical page can be mapped by more than one virtual page, so tracing back from the physical side is hard. A practical approach is to identify suspicious processes from the logs, then use the following two interfaces to determine whether a given process has the failing physical page mapped (see the sketches at the end of this section):
/proc/<pid>/maps
/proc/<pid>/pagemap
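The address computation above is exactly what edac_raw_mc_handle_error() does for the trace interface: (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page. A few lines of C to double-check it, assuming the usual 4 KiB page size:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

int main(void)
{
        unsigned long pfn = 0x1354969;  /* the "page:" field */
        unsigned long offset = 0x540;   /* the "offset:" field */

        printf("page base: 0x%lx\n", pfn << PAGE_SHIFT);             /* 0x1354969000 */
        printf("phys addr: 0x%lx\n", (pfn << PAGE_SHIFT) | offset);  /* 0x1354969540 */
        return 0;
}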
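To check whether a given process maps the failing page, walk its address ranges from /proc/<pid>/maps and look up each virtual page's pagemap entry (8 bytes per page; bits 0-54 hold the PFN and bit 63 the present flag, per Documentation/admin-guide/mm/pagemap.rst). The sketch below assumes 4 KiB pages and must run as root, since the kernel hides PFNs from unprivileged readers of pagemap; error handling is kept minimal.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <inttypes.h>

#define PAGE_SHIFT      12                      /* assume 4 KiB pages */
#define PM_PFN_MASK     ((1ULL << 55) - 1)      /* pagemap bits 0-54: PFN */
#define PM_PRESENT      (1ULL << 63)            /* pagemap bit 63: present */

int main(int argc, char **argv)
{
        char path[64], line[256];
        unsigned long start, end, vaddr;
        uint64_t entry, target_pfn;
        FILE *maps, *pagemap;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <pid> <pfn-in-hex>\n", argv[0]);
                return 1;
        }
        target_pfn = strtoull(argv[2], NULL, 16);

        snprintf(path, sizeof(path), "/proc/%s/maps", argv[1]);
        maps = fopen(path, "r");
        snprintf(path, sizeof(path), "/proc/%s/pagemap", argv[1]);
        pagemap = fopen(path, "rb");
        if (!maps || !pagemap) {
                perror("open /proc files");
                return 1;
        }

        while (fgets(line, sizeof(line), maps)) {
                /* each maps line starts with "start-end ..." in hex */
                if (sscanf(line, "%lx-%lx", &start, &end) != 2)
                        continue;
                for (vaddr = start; vaddr < end; vaddr += 1UL << PAGE_SHIFT) {
                        /* one 8-byte pagemap entry per virtual page */
                        if (fseek(pagemap, (long)(vaddr >> PAGE_SHIFT) * 8, SEEK_SET))
                                continue;
                        if (fread(&entry, sizeof(entry), 1, pagemap) != 1)
                                continue;
                        if ((entry & PM_PRESENT) && (entry & PM_PFN_MASK) == target_pfn)
                                printf("vaddr 0x%lx maps pfn 0x%" PRIx64 "\n",
                                       vaddr, target_pfn);
                }
        }
        fclose(maps);
        fclose(pagemap);
        return 0;
}

For the correctable-error example above, one would run this against each suspect PID with PFN 1354969, taken from the page: field of the EDAC log line.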