kernel: mce: [Hardware Error]: Machine check events logged (Part 2): Memory Errors
1. The kernel call stack for memory error handling
When the dmesg message kernel: mce: [Hardware Error]: Machine check events logged records a memory error, the functions that handle it are defined in:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc.c
The call chain is:
edac_mc_handle_error -> edac_raw_mc_handle_error -> edac_ce_error (or edac_ue_error for uncorrected errors)
- edac_mc_handle_error
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
                          struct mem_ctl_info *mci,
                          const u16 error_count,
                          const unsigned long page_frame_number,
                          const unsigned long offset_in_page,
                          const unsigned long syndrome,
                          const int top_layer,
                          const int mid_layer,
                          const int low_layer,
                          const char *msg,
                          const char *other_detail)
{
        struct dimm_info *dimm;
        char *p, *end;
        int row = -1, chan = -1;
        int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
        int i, n_labels = 0;
        struct edac_raw_error_desc *e = &mci->error_desc;
        bool any_memory = true;
        const char *prefix;

        edac_dbg(3, "MC%d\n", mci->mc_idx);

        /* Fills the error report buffer */
        memset(e, 0, sizeof (*e));
        e->error_count = error_count;
        e->type = type;
        e->top_layer = top_layer;
        e->mid_layer = mid_layer;
        e->low_layer = low_layer;
        e->page_frame_number = page_frame_number;
        e->offset_in_page = offset_in_page;
        e->syndrome = syndrome;
        /* need valid strings here for both: */
        e->msg = msg ?: "";
        e->other_detail = other_detail ?: "";

        /*
         * Check if the event report is consistent and if the memory location is
         * known. If it is, the DIMM(s) label info will be filled and the DIMM's
         * error counters will be incremented.
         */
        for (i = 0; i < mci->n_layers; i++) {
                if (pos[i] >= (int)mci->layers[i].size) {
                        edac_mc_printk(mci, KERN_ERR,
                                       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
                                       edac_layer_name[mci->layers[i].type],
                                       pos[i], mci->layers[i].size);
                        /*
                         * Instead of just returning it, let's use what's
                         * known about the error. The increment routines and
                         * the DIMM filter logic will do the right thing by
                         * pointing the likely damaged DIMMs.
                         */
                        pos[i] = -1;
                }
                if (pos[i] >= 0)
                        any_memory = false;
        }

        /*
         * Get the dimm label/grain that applies to the match criteria.
         * As the error algorithm may not be able to point to just one memory
         * stick, the logic here will get all possible labels that could
         * potentially be affected by the error.
         * On FB-DIMM memory controllers, for uncorrected errors, it is common
         * to have only the MC channel and the MC dimm (also called "branch")
         * but the channel is not known, as the memory is arranged in pairs,
         * where each memory belongs to a separate channel within the same
         * branch.
         */
        p = e->label;
        *p = '\0';
        end = p + sizeof(e->label);
        prefix = "";

        mci_for_each_dimm(mci, dimm) {
                if (top_layer >= 0 && top_layer != dimm->location[0])
                        continue;
                if (mid_layer >= 0 && mid_layer != dimm->location[1])
                        continue;
                if (low_layer >= 0 && low_layer != dimm->location[2])
                        continue;

                /* get the max grain, over the error match range */
                if (dimm->grain > e->grain)
                        e->grain = dimm->grain;

                /*
                 * If the error is memory-controller wide, there's no need to
                 * seek for the affected DIMMs because the whole channel/memory
                 * controller/... may be affected. Also, don't show errors for
                 * empty DIMM slots.
                 */
                if (!dimm->nr_pages)
                        continue;

                n_labels++;
                if (n_labels > EDAC_MAX_LABELS) {
                        p = e->label;
                        *p = '\0';
                } else {
                        p += scnprintf(p, end - p, "%s%s", prefix, dimm->label);
                        prefix = OTHER_LABEL;
                }

                /*
                 * get csrow/channel of the DIMM, in order to allow
                 * incrementing the compat API counters
                 */
                edac_dbg(4, "%s csrows map: (%d,%d)\n",
                         mci->csbased ? "rank" : "dimm",
                         dimm->csrow, dimm->cschannel);
                if (row == -1)
                        row = dimm->csrow;
                else if (row >= 0 && row != dimm->csrow)
                        row = -2;

                if (chan == -1)
                        chan = dimm->cschannel;
                else if (chan >= 0 && chan != dimm->cschannel)
                        chan = -2;
        }

        if (any_memory)
                strscpy(e->label, "any memory", sizeof(e->label));
        else if (!*e->label)
                strscpy(e->label, "unknown memory", sizeof(e->label));

        edac_inc_csrow(e, row, chan);

        /* Fill the RAM location data */
        p = e->location;
        end = p + sizeof(e->location);
        prefix = "";

        for (i = 0; i < mci->n_layers; i++) {
                if (pos[i] < 0)
                        continue;
                p += scnprintf(p, end - p, "%s%s:%d", prefix,
                               edac_layer_name[mci->layers[i].type], pos[i]);
                prefix = " ";
        }

        edac_raw_mc_handle_error(e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
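For context on how this entry point is reached: a platform EDAC driver (sb_edac, i10nm_edac, and so on) decodes the machine-check record into layer coordinates and then calls edac_mc_handle_error(). The fragment below is only an illustrative sketch of such a call site; the function name and all decoded values are hypothetical placeholders, not code from any real driver.

static void report_decoded_error(struct mem_ctl_info *mci)
{
        /* Placeholder values standing in for what a driver would
         * decode from the machine-check registers. */
        edac_mc_handle_error(HW_EVENT_ERR_CORRECTED,    /* type: a CE */
                             mci,
                             1,                 /* error_count */
                             0x1354969,         /* page_frame_number */
                             0x540,             /* offset_in_page */
                             0,                 /* syndrome */
                             2,                 /* top_layer, e.g. channel */
                             0,                 /* mid_layer, e.g. slot */
                             -1,                /* low_layer: unknown */
                             "memory read error",
                             "err_code:0101:0090");
}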
- edac_raw_mc_handle_error
void edac_raw_mc_handle_error(struct edac_raw_error_desc *e)
{
        struct mem_ctl_info *mci = error_desc_to_mci(e);
        u8 grain_bits;

        /* Sanity-check driver-supplied grain value. */
        if (WARN_ON_ONCE(!e->grain))
                e->grain = 1;

        grain_bits = fls_long(e->grain - 1);

        /* Report the error via the trace interface */
        if (IS_ENABLED(CONFIG_RAS))
                trace_mc_event(e->type, e->msg, e->label, e->error_count,
                               mci->mc_idx, e->top_layer, e->mid_layer,
                               e->low_layer,
                               (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
                               grain_bits, e->syndrome, e->other_detail);

        if (e->type == HW_EVENT_ERR_CORRECTED)
                edac_ce_error(e);
        else
                edac_ue_error(e);
}
EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
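Note how grain_bits is derived: fls_long(e->grain - 1) is effectively ceil(log2(grain)), so the grain passed to the trace interface is expressed as a power-of-two exponent. For the grain:32 value seen in the logs below this yields 5 (2^5 = 32). A userspace re-implementation of the same arithmetic, for illustration only:

#include <stdio.h>

/* 1-based find-last-set, mirroring the kernel's fls_long() helper */
static unsigned int fls_long(unsigned long x)
{
        unsigned int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned long grain = 32;       /* matches "grain:32" in the EDAC logs */

        printf("grain_bits = %u\n", fls_long(grain - 1));       /* prints 5 */
        return 0;
}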
- edac_ce_error
static void edac_ce_error(struct edac_raw_error_desc *e)
{
        struct mem_ctl_info *mci = error_desc_to_mci(e);
        unsigned long remapped_page;

        if (edac_mc_get_log_ce()) {
                edac_mc_printk(mci, KERN_WARNING,
                        "%d CE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx%s%s)\n",
                        e->error_count, e->msg,
                        *e->msg ? " " : "",
                        e->label, e->location, e->page_frame_number, e->offset_in_page,
                        e->grain, e->syndrome,
                        *e->other_detail ? " - " : "",
                        e->other_detail);
        }

        edac_inc_ce_error(e);

        if (mci->scrub_mode == SCRUB_SW_SRC) {
                /*
                 * Some memory controllers (called MCs below) can remap
                 * memory so that it is still available at a different
                 * address when PCI devices map into memory.
                 * MC's that can't do this, lose the memory where PCI
                 * devices are mapped. This mapping is MC-dependent
                 * and so we call back into the MC driver for it to
                 * map the MC page to a physical (CPU) page which can
                 * then be mapped to a virtual page - which can then
                 * be scrubbed.
                 */
                remapped_page = mci->ctl_page_to_phys ?
                        mci->ctl_page_to_phys(mci, e->page_frame_number) :
                        e->page_frame_number;

                edac_mc_scrub_block(remapped_page, e->offset_in_page, e->grain);
        }
}
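Besides printing to the kernel log, edac_ce_error() calls edac_inc_ce_error(), which feeds the per-controller and per-DIMM counters exported through sysfs. A minimal userspace reader for those counters, assuming the standard layout /sys/devices/system/edac/mc/mc<N>/ce_count:

#include <stdio.h>
#include <glob.h>

int main(void)
{
        glob_t g;
        size_t i;

        /* one ce_count file per memory controller */
        if (glob("/sys/devices/system/edac/mc/mc*/ce_count", 0, NULL, &g))
                return 1;

        for (i = 0; i < g.gl_pathc; i++) {
                FILE *f = fopen(g.gl_pathv[i], "r");
                unsigned long ce = 0;

                if (!f)
                        continue;
                if (fscanf(f, "%lu", &ce) == 1)
                        printf("%s: %lu\n", g.gl_pathv[i], ce);
                fclose(f);
        }
        globfree(&g);
        return 0;
}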
2. Example: an uncorrectable memory read error
(1) Symptom
https://access.redhat.com/solutions/6997755
Feb 8 08:45:20 abcxyz kernel: EDAC MC1: 0 CE memory read error on CPU_SrcID#0_MC#1_Chan#2_DIMM#0 (channel:2 slot:0 page:0x2870e08 offset:0xb40 grain:32 syndrome:0x0 - err_code:0x0000:0x009f socket:0 imc:1 rank:0 bg:1 ba:2 row:0x1a970 col:0x170)
Feb 8 08:45:20 abcxyz kernel: soft offline: 0x2870e08: migration failed 1, type 2fffff00008000
Feb 8 08:45:21 abcxyz kernel: MCE: Killing SomeAppThread:159778 due to hardware memory corruption fault at 2aab464098c8
These log lines, recorded at 08:45:20 on February 8, 2023, document a memory read error detected through the EDAC (Error Detection and Correction) mechanism. The first line reports 0 CE memory read error: the correctable-error (Correctable Error, CE) count is zero, meaning the hardware detected the corruption but could not correct it.
(2) Log analysis
First log line: EDAC MC1: 0 CE memory read error
- EDAC MC1: the error is reported by memory controller 1 (Memory Controller 1) via EDAC.
- 0 CE: the correctable-error (Correctable Error) count is 0, i.e. the hardware detected corrupted data in memory but could not correct it.
- CPU_SrcID#0_MC#1_Chan#2_DIMM#0: the error location:
  - CPU_SrcID#0: CPU source ID 0 (the first CPU).
  - MC#1: memory controller 1.
  - Chan#2: memory channel 2.
  - DIMM#0: DIMM slot 0.
- channel:2 slot:0: confirms the failing DIMM sits on channel 2, slot 0.
- page:0x2870e08 offset:0xb40: the physical page frame number and the offset within that page. Combined as (0x2870e08 << 12) | 0xb40 (assuming 4 KiB pages), they give the physical address 0x2870e08b40; section 3 below walks through the same arithmetic.
- grain:32: the granularity of the error report (32 bytes).
- syndrome:0x0: the syndrome is a key piece of diagnostic data, normally supplied by ECC (Error-Correcting Code) memory; it helps identify the type and position of the faulty bits.
- err_code:0x0000:0x009f: a hardware-specific error code.
- socket:0 imc:1 rank:0 bg:1 ba:2 row:0x1a970 col:0x170: the location within the physical memory topology:
  - socket:0: CPU socket 0.
  - imc:1: integrated memory controller 1.
  - rank:0: memory rank 0.
  - bg:1 ba:2: bank group and bank address.
  - row:0x1a970 col:0x170: row and column address.
Second log line: soft offline: 0x2870e08: migration failed
- soft offline: the kernel tried to "soft offline" the failing page, i.e. take it out of service.
- 0x2870e08: the page frame number of the failing physical page.
- migration failed: the kernel could not migrate the page's contents elsewhere first.
- type 2fffff00008000: the page's type/state flags at the time of the attempt.
Third log line: MCE: Killing SomeAppThread:159778 due to hardware memory corruption fault
- MCE: Machine Check Exception, i.e. the hardware detected a serious error.
- Killing SomeAppThread:159778: the kernel killed the process SomeAppThread (PID 159778) because it accessed the corrupted memory.
- hardware memory corruption fault: the fault was caused by hardware memory corruption.
- at 2aab464098c8: the virtual address at which the fault occurred.
3. Example: a correctable memory read error
(1) Symptom
https://www.supermicro.org.cn/support/faqs/faq.cfm?faq=35300
kernel: EDAC MC1: 1 CE memory read error on CPU_SrcID#0_MC#1_Chan#0_DIMM#0 (channel:0 slot:0 page:0x1354969 offset:0x540 grain:32 syndrome:0x0 - err_code:0101:0090 socket:0 imc:1 rank:1 bg:3 ba:0 row:12a9b col:50)
This log line shows that the system detected a correctable memory read error (Correctable Error, CE). Field by field:
(2) Log breakdown
- EDAC MC1: reported by memory controller 1 (Memory Controller, MC).
- 1 CE memory read error: one correctable error was detected during a memory read; the system corrected it automatically, with no manual intervention required.
- CPU_SrcID#0_MC#1_Chan#0_DIMM#0: identifies the failing hardware component: CPU source ID 0, memory controller 1, channel 0, DIMM 0.
- (channel:0 slot:0 page:0x1354969 offset:0x540 grain:32 syndrome:0x0 - err_code:0101:0090 socket:0 imc:1 rank:1 bg:3 ba:0 row:12a9b col:50):
  - channel:0: the error occurred on channel 0.
  - slot:0: slot number 0.
  - page:0x1354969: the physical page frame number.
  - offset:0x540: the offset within the page.
  - grain:32: the smallest unit the report can resolve, 32 bytes.
  - syndrome:0x0: the ECC syndrome value used for error detection.
  - err_code:0101:0090: the specific error code.
  - socket:0: CPU socket number.
  - imc:1: integrated memory controller number.
  - rank:1: rank number within the DIMM.
  - bg:3: bank group number.
  - ba:0: bank address number.
  - row:12a9b: row address.
  - col:50: column address.
(3) Was any process affected?
In the uncorrectable-error example above, the log explicitly names the process that was affected.
For a correctable error, no process should be affected in principle. But what if some process on the system really is in an abnormal state? We can check whether the failing page was in use:
- Compute the physical address
  - page:0x1354969: the physical page frame number.
  - offset:0x540: the offset from the start of the page.
  - grain:32: the smallest affected unit, 32 bytes.
Assuming a 4 KiB page size:
physical page base: 0x1354969000 (page frame number * page size)
physical address: 0x1354969540 (page base + 0x540)
- What is the corresponding virtual address?
A physical page can be mapped by more than one virtual page, so tracing back from the physical side is hard. A practical approach is to identify suspicious processes from the logs, then use the following two interfaces to determine whether a given process has the failing physical page mapped (see the sketches at the end of this section):
/proc/<pid>/maps
/proc/<pid>/pagemap
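The address computation above is exactly what edac_raw_mc_handle_error() does for the trace interface: (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page. A few lines of C to double-check it, assuming the usual 4 KiB page size:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

int main(void)
{
        unsigned long pfn = 0x1354969;  /* the "page:" field */
        unsigned long offset = 0x540;   /* the "offset:" field */

        printf("page base: 0x%lx\n", pfn << PAGE_SHIFT);             /* 0x1354969000 */
        printf("phys addr: 0x%lx\n", (pfn << PAGE_SHIFT) | offset);  /* 0x1354969540 */
        return 0;
}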
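To check whether a given process maps the failing page, walk its address ranges from /proc/<pid>/maps and look up each virtual page's pagemap entry (8 bytes per page; bits 0-54 hold the PFN and bit 63 the present flag, per Documentation/admin-guide/mm/pagemap.rst). The sketch below assumes 4 KiB pages and must run as root, since the kernel hides PFNs from unprivileged readers of pagemap; error handling is kept minimal.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <inttypes.h>

#define PAGE_SHIFT      12                      /* assume 4 KiB pages */
#define PM_PFN_MASK     ((1ULL << 55) - 1)      /* pagemap bits 0-54: PFN */
#define PM_PRESENT      (1ULL << 63)            /* pagemap bit 63: present */

int main(int argc, char **argv)
{
        char path[64], line[256];
        unsigned long start, end, vaddr;
        uint64_t entry, target_pfn;
        FILE *maps, *pagemap;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <pid> <pfn-in-hex>\n", argv[0]);
                return 1;
        }
        target_pfn = strtoull(argv[2], NULL, 16);

        snprintf(path, sizeof(path), "/proc/%s/maps", argv[1]);
        maps = fopen(path, "r");
        snprintf(path, sizeof(path), "/proc/%s/pagemap", argv[1]);
        pagemap = fopen(path, "rb");
        if (!maps || !pagemap) {
                perror("open /proc files");
                return 1;
        }

        while (fgets(line, sizeof(line), maps)) {
                /* each maps line starts with "start-end ..." in hex */
                if (sscanf(line, "%lx-%lx", &start, &end) != 2)
                        continue;
                for (vaddr = start; vaddr < end; vaddr += 1UL << PAGE_SHIFT) {
                        /* one 8-byte pagemap entry per virtual page */
                        if (fseek(pagemap, (long)(vaddr >> PAGE_SHIFT) * 8, SEEK_SET))
                                continue;
                        if (fread(&entry, sizeof(entry), 1, pagemap) != 1)
                                continue;
                        if ((entry & PM_PRESENT) && (entry & PM_PFN_MASK) == target_pfn)
                                printf("vaddr 0x%lx maps pfn 0x%" PRIx64 "\n",
                                       vaddr, target_pfn);
                }
        }
        fclose(maps);
        fclose(pagemap);
        return 0;
}

For the correctable-error example above, one would run this against each suspect PID with PFN 1354969, taken from the page: field of the EDAC log line.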