kernel: mce: [Hardware Error]: Machine check events logged(二)内存错误

kernel: mce: [Hardware Error]: Machine check events logged(二)内存错误


1. 内存错误处理的内核函数调用栈

在dmesg中,kernel: mce: [Hardware Error]: Machine check events logged如果记录的是内存错误,其错误处理的函数调用关系定义在如下位置:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/edac/edac_mc.c

函数调用堆栈为:
edac_mc_handle_error -> edac_raw_mc_handle_error -> edac_ce_error

  • edac_mc_handle_error
/**
 * edac_mc_handle_error - report a memory event on a memory controller.
 * @type:		severity of the event (corrected / uncorrected / ...)
 * @mci:		memory controller the event was detected on
 * @error_count:	number of errors of this type being reported at once
 * @page_frame_number:	physical page frame where the error hit
 * @offset_in_page:	offset of the error inside that page
 * @syndrome:		ECC syndrome, when the driver knows it
 * @top_layer:		position in the top MC layer, or negative if unknown
 * @mid_layer:		position in the middle MC layer, or negative if unknown
 * @low_layer:		position in the low MC layer, or negative if unknown
 * @msg:		driver-supplied message describing the error (may be NULL)
 * @other_detail:	driver-supplied extra details (may be NULL)
 *
 * Fills mci->error_desc from the arguments, resolves the affected DIMM
 * label(s), grain and csrow/channel, builds the textual RAM location
 * string, then hands the descriptor to edac_raw_mc_handle_error() for
 * tracing, logging and counter accounting.
 */
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
			  struct mem_ctl_info *mci,
			  const u16 error_count,
			  const unsigned long page_frame_number,
			  const unsigned long offset_in_page,
			  const unsigned long syndrome,
			  const int top_layer,
			  const int mid_layer,
			  const int low_layer,
			  const char *msg,
			  const char *other_detail)
{
	struct dimm_info *dimm;
	char *p, *end;
	int row = -1, chan = -1;
	int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
	int i, n_labels = 0;
	struct edac_raw_error_desc *e = &mci->error_desc;
	bool any_memory = true;
	const char *prefix;

	edac_dbg(3, "MC%d\n", mci->mc_idx);

	/* Fills the error report buffer */
	memset(e, 0, sizeof (*e));
	e->error_count = error_count;
	e->type = type;
	e->top_layer = top_layer;
	e->mid_layer = mid_layer;
	e->low_layer = low_layer;
	e->page_frame_number = page_frame_number;
	e->offset_in_page = offset_in_page;
	e->syndrome = syndrome;
	/* need valid strings here for both: */
	e->msg = msg ?: "";
	e->other_detail = other_detail ?: "";

	/*
	 * Check if the event report is consistent and if the memory location is
	 * known. If it is, the DIMM(s) label info will be filled and the DIMM's
	 * error counters will be incremented.
	 */
	for (i = 0; i < mci->n_layers; i++) {
		if (pos[i] >= (int)mci->layers[i].size) {

			edac_mc_printk(mci, KERN_ERR,
				       "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
				       edac_layer_name[mci->layers[i].type],
				       pos[i], mci->layers[i].size);
			/*
			 * Instead of just returning it, let's use what's
			 * known about the error. The increment routines and
			 * the DIMM filter logic will do the right thing by
			 * pointing the likely damaged DIMMs.
			 */
			pos[i] = -1;
		}
		/*
		 * At least one valid layer position means the location is
		 * (partially) known, so the label won't be "any memory".
		 */
		if (pos[i] >= 0)
			any_memory = false;
	}

	/*
	 * Get the dimm label/grain that applies to the match criteria.
	 * As the error algorithm may not be able to point to just one memory
	 * stick, the logic here will get all possible labels that could
	 * potentially be affected by the error.
	 * On FB-DIMM memory controllers, for uncorrected errors, it is common
	 * to have only the MC channel and the MC dimm (also called "branch")
	 * but the channel is not known, as the memory is arranged in pairs,
	 * where each memory belongs to a separate channel within the same
	 * branch.
	 */
	p = e->label;
	*p = '\0';
	end = p + sizeof(e->label);
	prefix = "";

	/* Collect every DIMM matching the known layer positions. */
	mci_for_each_dimm(mci, dimm) {
		if (top_layer >= 0 && top_layer != dimm->location[0])
			continue;
		if (mid_layer >= 0 && mid_layer != dimm->location[1])
			continue;
		if (low_layer >= 0 && low_layer != dimm->location[2])
			continue;

		/* get the max grain, over the error match range */
		if (dimm->grain > e->grain)
			e->grain = dimm->grain;

		/*
		 * If the error is memory-controller wide, there's no need to
		 * seek for the affected DIMMs because the whole channel/memory
		 * controller/... may be affected. Also, don't show errors for
		 * empty DIMM slots.
		 */
		if (!dimm->nr_pages)
			continue;

		n_labels++;
		if (n_labels > EDAC_MAX_LABELS) {
			/*
			 * Too many candidates to name individually: clear the
			 * label so the generic fallback string is used below.
			 */
			p = e->label;
			*p = '\0';
		} else {
			p += scnprintf(p, end - p, "%s%s", prefix, dimm->label);
			prefix = OTHER_LABEL;
		}

		/*
		 * get csrow/channel of the DIMM, in order to allow
		 * incrementing the compat API counters
		 */
		edac_dbg(4, "%s csrows map: (%d,%d)\n",
			mci->csbased ? "rank" : "dimm",
			dimm->csrow, dimm->cschannel);
		/* -2 means the error spans more than one csrow/channel. */
		if (row == -1)
			row = dimm->csrow;
		else if (row >= 0 && row != dimm->csrow)
			row = -2;

		if (chan == -1)
			chan = dimm->cschannel;
		else if (chan >= 0 && chan != dimm->cschannel)
			chan = -2;
	}

	if (any_memory)
		strscpy(e->label, "any memory", sizeof(e->label));
	else if (!*e->label)
		strscpy(e->label, "unknown memory", sizeof(e->label));

	edac_inc_csrow(e, row, chan);

	/* Fill the RAM location data */
	p = e->location;
	end = p + sizeof(e->location);
	prefix = "";

	for (i = 0; i < mci->n_layers; i++) {
		if (pos[i] < 0)
			continue;

		p += scnprintf(p, end - p, "%s%s:%d", prefix,
			       edac_layer_name[mci->layers[i].type], pos[i]);
		prefix = " ";
	}

	edac_raw_mc_handle_error(e);
}
EXPORT_SYMBOL_GPL(edac_mc_handle_error);
  • edac_raw_mc_handle_error
/*
 * Dispatch a fully-filled error descriptor: emit the RAS trace event and
 * hand off to the corrected- or uncorrected-error reporting path.
 */
void edac_raw_mc_handle_error(struct edac_raw_error_desc *e)
{
	struct mem_ctl_info *mci = error_desc_to_mci(e);
	u8 order;

	/*
	 * A zero grain would be a driver bug; clamp it to 1 so the
	 * fls_long() computation below stays sane.
	 */
	if (WARN_ON_ONCE(!e->grain))
		e->grain = 1;

	/* Number of bits needed to express the grain. */
	order = fls_long(e->grain - 1);

	/* Report the error via the trace interface */
	if (IS_ENABLED(CONFIG_RAS))
		trace_mc_event(e->type, e->msg, e->label, e->error_count,
			       mci->mc_idx, e->top_layer, e->mid_layer,
			       e->low_layer,
			       (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
			       order, e->syndrome, e->other_detail);

	if (e->type != HW_EVENT_ERR_CORRECTED)
		edac_ue_error(e);
	else
		edac_ce_error(e);
}
EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
  • edac_ce_error
/*
 * Corrected-error path: optionally log the event, bump the CE counters,
 * and — when software scrubbing is enabled — scrub the affected block.
 */
static void edac_ce_error(struct edac_raw_error_desc *e)
{
	struct mem_ctl_info *mci = error_desc_to_mci(e);

	if (edac_mc_get_log_ce()) {
		edac_mc_printk(mci, KERN_WARNING,
			"%d CE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx%s%s)\n",
			e->error_count, e->msg,
			*e->msg ? " " : "",
			e->label, e->location, e->page_frame_number, e->offset_in_page,
			e->grain, e->syndrome,
			*e->other_detail ? " - " : "",
			e->other_detail);
	}

	edac_inc_ce_error(e);

	if (mci->scrub_mode == SCRUB_SW_SRC) {
		unsigned long phys_pfn;

		/*
		 * Some memory controllers (called MCs below) can remap
		 * memory so that it is still available at a different
		 * address when PCI devices map into memory.
		 * MC's that can't do this, lose the memory where PCI
		 * devices are mapped. This mapping is MC-dependent
		 * and so we call back into the MC driver for it to
		 * map the MC page to a physical (CPU) page which can
		 * then be mapped to a virtual page - which can then
		 * be scrubbed.
		 */
		if (mci->ctl_page_to_phys)
			phys_pfn = mci->ctl_page_to_phys(mci, e->page_frame_number);
		else
			phys_pfn = e->page_frame_number;

		edac_mc_scrub_block(phys_pfn, e->offset_in_page, e->grain);
	}
}

2. 实例:无法纠正的读内存错误

(1) 错误现象

https://access.redhat.com/solutions/6997755

Feb  8 08:45:20 abcxyz kernel: EDAC MC1: 0 CE memory read error on CPU_SrcID#0_MC#1_Chan#2_DIMM#0 (channel:2 slot:0 page:0x2870e08 offset:0xb40 grain:32 syndrome:0x0 -  err_code:0x0000:0x009f socket:0 imc:1 rank:0 bg:1 ba:2 row:0x1a970 col:0x170)
Feb  8 08:45:20 abcxyz kernel: soft offline: 0x2870e08: migration failed 1, type 2fffff00008000
Feb  8 08:45:21 abcxyz kernel: MCE: Killing SomeAppThread:159778 due to hardware memory corruption fault at 2aab464098c8

这条日志信息系统在2023年2月8日的08:45:20记录了一个内存读取错误。

这个错误是通过EDAC(Error Detection and Correction)机制检测到的,这里报告的是0 CE memory read error,这意味着没有检测到可纠正的错误(Correctable Errors, CEs)。

(2)日志分析

第一条日志:EDAC MC1: 0 CE memory read error

  • EDAC MC1: 表示这是由内存控制器1(Memory Controller 1)报告的EDAC错误。
  • 0 CE: 表示本条日志中可纠正错误(Correctable Error)的计数为0,即硬件检测到了内存数据错误,但无法将其纠正。
  • CPU_SrcID#0_MC#1_Chan#2_DIMM#0: 表示错误发生在以下位置:
    • CPU_SrcID#0: CPU源ID为0(即第0个CPU)。
    • MC#1: 内存控制器1。
    • Chan#2: 内存通道2。
    • DIMM#0: 内存插槽0。
  • channel:2 slot:0: 进一步确认错误发生在通道2、插槽0的内存条上。
  • page:0x2870e08 offset:0xb40: 错误发生的物理内存页框号和偏移量。
  • grain:32: 内存错误的粒度(32字节)。
  • syndrome:0x0: syndrome(综合症)是一个关键的诊断信息,它通常由ECC(Error-Correcting Code)内存提供。具体来说,syndrome值用于帮助识别和定位内存中的错误类型和位置。
  • err_code:0x0000:0x009f: 错误代码,可能与硬件相关。
  • socket:0 imc:1 rank:0 bg:1 ba:2 row:0x1a970 col:0x170: 这些是内存的物理地址信息,包括:
    • socket:0: CPU插槽0。
    • imc:1: 集成内存控制器1。
    • rank:0: 内存rank 0。
    • bg:1 ba:2: Bank Group和Bank地址。
    • row:0x1a970 col:0x170: 内存行和列地址。

第二条日志:soft offline: 0x2870e08: migration failed

  • soft offline: 表示内核尝试将出错的内存页标记为“软下线”(即不再使用)。
  • 0x2870e08: 出错的物理内存页地址。
  • migration failed: 表示内核尝试将该内存页中的数据迁移到其他位置失败。
  • type 2fffff00008000: 内存页的类型或状态。

第三条日志:MCE: Killing SomeAppThread:159778 due to hardware memory corruption fault

  • MCE: Machine Check Exception(机器检查异常),表示硬件检测到严重错误。
  • Killing SomeAppThread:159778: 内核终止了进程SomeAppThread(PID为159778),因为该进程试图访问损坏的内存。
  • hardware memory corruption fault: 表示这是一个硬件内存损坏错误。
  • at 2aab464098c8: 错误发生的虚拟内存地址。

3. 实例:可纠正的读内存错误

(1) 错误现象

https://www.supermicro.org.cn/support/faqs/faq.cfm?faq=35300

kernel: EDAC MC1: 1 CE memory read error on CPU_SrcID#0_MC#1_Chan#0_DIMM#0 (channel:0 slot:0 page:0x1354969 offset:0x540 grain:32 syndrome:0x0 - err_code:0101:0090 socket:0 imc:1 rank:1 bg:3 ba:0 row:12a9b col:50)

这条日志信息显示你的系统检测到了一个可纠正的内存读取错误(Correctable Error, CE)。以下是对你提供的日志信息的详细解释和建议措施:

(2)日志解析

  • EDAC MC1: 表示这是由第1个内存控制器(Memory Controller, MC)报告的。

  • 1 CE memory read error: 表示在内存读取过程中检测到了一个可纠正的错误。这意味着系统能够自动纠正这个错误而不需要人工干预。

  • CPU_SrcID#0_MC#1_Chan#0_DIMM#0: 提供了关于哪个硬件组件出现错误的具体信息。这里是第0个CPU源ID、第1个内存控制器、第0通道和第0个DIMM模块。

  • (channel:0 slot:0 page:0x1354969 offset:0x540 grain:32 syndrome:0x0 - err_code:0101:0090 socket:0 imc:1 rank:1 bg:3 ba:0 row:12a9b col:50):

    • channel:0: 错误发生在第0通道。
    • slot:0: 插槽编号为0。
    • page:0x1354969: 物理内存页框号。
    • offset:0x540: 偏移量。
    • grain:32: 影响的最小单位大小是32字节。
    • syndrome:0x0: 综合症值,用于错误检测。
    • err_code:0101:0090: 具体的错误代码。
    • socket:0: CPU插座编号。
    • imc:1: 内存控制器编号。
    • rank:1: 内存条的Rank编号。
    • bg:3: Bank Group编号。
    • ba:0: Bank Address编号。
    • row:12a9b: 行地址。
    • col:50: 列地址。

(3)是否有进程受到了影响

在第一个无法纠正的内存错误的实例中,可以看到,有明确的日志显示哪个进程受到了影响。

而对于可以纠正的内存错误,按道理说是没有进程受到影响的,但是万一系统里真的有进程处于异常状态呢?

  • 计算物理地址
    • page:0x1354969: 物理内存页框号。
    • offset:0x540: 相对于页面起始地址的偏移量。
    • grain:32: 影响的最小单位大小是32字节。

假设页面大小为4K:
物理页面地址:0x1354969000 (物理内存页框号 * 页面大小)
物理地址:0x1354969540 (物理页面地址 + 偏移量0x540)

  • 对应的虚拟页面地址?

物理页面可能被多个虚拟页面使用,很难反向追溯。

可以通过日志判断系统中是否有进程异常,然后通过如下两个文件推导出物理页面是否被进程使用:
/proc/&lt;pid&gt;/maps
/proc/&lt;pid&gt;/pagemap

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值