diff mbox series

[RFC] cxl: avoid duplicating report from MCE & device

Message ID 20240618165310.877974-1-ruansy.fnst@fujitsu.com (mailing list archive)
State New, archived
Headers show
Series [RFC] cxl: avoid duplicating report from MCE & device | expand

Commit Message

Shiyang Ruan June 18, 2024, 4:53 p.m. UTC
Background:
Since a CXL device is a memory device, when the CPU consumes a poison
page of the CXL device it always triggers an MCE by interrupt (INT18),
no matter which "-First" path is configured.  This is the first report.
Currently, in the FW-First path, the poison event is then transferred
along the following path: CXL device -> firmware -> OS:ACPI->APEI->GHES
 -> CPER -> trace report.  This is the second one.  These two reports
indicate the same poisoned page, which is the so-called "duplicate
report"[1].  The memory_failure() handling I'm trying to add in the
OS-First path could also become another duplicate report.

Hopefully the flow below makes it easier to understand:
CPU accesses bad memory on CXL device, then
 -> MCE (INT18), *always* report (1)
 -> * FW-First (implemented now)
      -> CXL device -> FW
	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
    * OS-First (not implemented yet, I'm working on it)
      -> CXL device -> MSI
	      -> OS:CXL driver -> memory_failure() (2.b)
so, the (1) and (2.a/b) are duplicated.

(I didn't get a response to my reply in [1], but I have to make a patch
to solve this problem, so please correct me if my understanding is wrong.)

This patch adds a new notifier_block with a new priority, MCE_PRIO_CXL,
to `x86_mce_decoder_chain`, so that the CXL memdev can check whether the
current poison page has already been reported (if yes, stop the notifier
chain and skip the subsequent memory_failure() report).  In this way, if
the poison page was already handled (recorded and reported) in (1) or (2),
the other path won't duplicate the report.  The record can be cleared
when cxl_clear_poison() is called.

[1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
---
 arch/x86/include/asm/mce.h |   1 +
 drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/memdev.c  |   6 +-
 drivers/cxl/cxlmem.h       |   3 +
 4 files changed, 139 insertions(+), 1 deletion(-)
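
Condensed from the patch (which is quoted in full in the comments below),
the core of the mechanism is roughly the following sketch; the
pfn_valid() pre-check and the trace-path hook are omitted here for
brevity:

static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
			  void *data)
{
	struct mce *mce = data;
	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
						    mce_notifier);
	u64 hpa;

	if (!mce || !mce_usable_address(mce))
		return NOTIFY_DONE;

	hpa = mce->addr & MCI_ADDR_PHYSADDR;

	/* Only act on HPAs backed by this CXL memdev */
	if (!cxl_contains_hpa(mds->cxlds.cxlmd, hpa))
		return NOTIFY_DONE;

	/* Already recorded by the other reporting path? Skip memory_failure() */
	return cxl_mce_recorded(hpa) ? NOTIFY_STOP : NOTIFY_OK;
}

/* in cxl_memdev_state_create(): one notifier registered per memdev */
	mds->mce_notifier.notifier_call = cxl_handle_mce;
	mds->mce_notifier.priority = MCE_PRIO_CXL;
	mce_register_decode_chain(&mds->mce_notifier);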

Comments

Dave Jiang June 18, 2024, 11:35 p.m. UTC | #1
On 6/18/24 9:53 AM, Shiyang Ruan wrote:
> Background:
> Since CXL device is a memory device, while CPU consumes a poison page of 
> CXL device, it always triggers a MCE by interrupt (INT18), no matter 
> which-First path is configured.  This is the first report.  Then 
> currently, in FW-First path, the poison event is transferred according 
> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES 
>  -> CPER -> trace report.  This is the second one.  These two reports
> are indicating the same poisoning page, which is the so-called "duplicate
> report"[1].  And the memory_failure() handling I'm trying to add in
> OS-First path could also be another duplicate report.
> 
> Hope the flow below could make it easier to understand:
> CPU accesses bad memory on CXL device, then
>  -> MCE (INT18), *always* report (1)
>  -> * FW-First (implemented now)
>       -> CXL device -> FW
> 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>     * OS-First (not implemented yet, I'm working on it)
>       -> CXL device -> MSI
> 	      -> OS:CXL driver -> memory_failure() (2.b)
> so, the (1) and (2.a/b) are duplicated.
> 
> (I didn't get response in my reply for [1] while I have to make patch to
> solve this problem, so please correct me if my understanding is wrong.)
> 
> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> to check whether the current poison page has been reported (if yes,
> stop the notifier chain, won't call the following memory_failure()
> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> page already handled(recorded and reported) in (1) or (2), the other one
> won't duplicate the report.  The record could be clear when
> cxl_clear_poison() is called.
> 
> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  arch/x86/include/asm/mce.h |   1 +
>  drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/memdev.c  |   6 +-
>  drivers/cxl/cxlmem.h       |   3 +
>  4 files changed, 139 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index dfd2e9699bd7..d8109c48e7d9 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>  	MCE_PRIO_NFIT,
>  	MCE_PRIO_EXTLOG,
>  	MCE_PRIO_UC,
> +	MCE_PRIO_CXL,
>  	MCE_PRIO_EARLY,
>  	MCE_PRIO_CEC,
>  	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 2626f3fff201..0eb3c5401e81 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -4,6 +4,8 @@
>  #include <linux/debugfs.h>
>  #include <linux/ktime.h>
>  #include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <asm/mce.h>
>  #include <asm/unaligned.h>
>  #include <cxlpci.h>
>  #include <cxlmem.h>
> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>  		if (cxlr)
>  			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>  
> +		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> +			return;
> +
>  		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>  			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>  						&evt->gen_media);
> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>  
> +struct cxl_mce_record {
> +	struct list_head node;
> +	u64 hpa;
> +};
> +LIST_HEAD(cxl_mce_records);
> +DEFINE_MUTEX(cxl_mce_mutex);
> +
> +bool cxl_mce_recorded(u64 hpa)
> +{
> +	struct cxl_mce_record *cur, *next, *rec;
> +	int rc;
> +
> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);
> +	if (rc)
> +		return false;
> +
> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
> +		if (cur->hpa == hpa) {
> +			mutex_unlock(&cxl_mce_mutex);
> +			return true;
> +		}
> +	}
> +
> +	rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
> +	rec->hpa = hpa;
> +	list_add(&cxl_mce_records, &rec->node);
> +
> +	mutex_unlock(&cxl_mce_mutex);
> +
> +	return false;
> +}
> +
> +void cxl_mce_clear(u64 hpa)
> +{
> +	struct cxl_mce_record *cur, *next;
> +	int rc;
> +
> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);
> +	if (rc)
> +		return;
> +
> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
> +		if (cur->hpa == hpa) {
> +			list_del(&cur->node);
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&cxl_mce_mutex);
> +}
> +
> +struct cxl_contains_hpa_context {
> +	bool contains;
> +	u64 hpa;
> +};
> +
> +static int __cxl_contains_hpa(struct device *dev, void *arg)
> +{
> +	struct cxl_contains_hpa_context *ctx = arg;
> +	struct cxl_endpoint_decoder *cxled;
> +	struct range *range;
> +	u64 hpa = ctx->hpa;
> +
> +	if (!is_endpoint_decoder(dev))
> +		return 0;
> +
> +	cxled = to_cxl_endpoint_decoder(dev);
> +	range = &cxled->cxld.hpa_range;
> +
> +	if (range->start <= hpa && hpa <= range->end) {
> +		ctx->contains = true;
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
> +{
> +	struct cxl_contains_hpa_context ctx = {
> +		.contains = false,
> +		.hpa = hpa,
> +	};
> +	struct cxl_port *port;
> +
> +	port = cxlmd->endpoint;
> +	if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))

Maybe no need to check is_cxl_endpoint() if the port is retrieved from cxlmd->endpoint.

Also, in order to use cxl_num_decoders_committed(), cxl_region_rwsem must be held. See the lockdep_assert_held() in the function. Maybe add a
guard(cxl_region_rwsem);
before the if statement above.

DJ
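
For illustration, a minimal sketch of the helper with both of those
suggestions applied; expressing the locking as guard(rwsem_read) on
cxl_region_rwsem is an assumption about the eventual rework, not code
from the posted patch:

static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
{
	struct cxl_contains_hpa_context ctx = {
		.contains = false,
		.hpa = hpa,
	};
	struct cxl_port *port = cxlmd->endpoint;

	/* cxl_num_decoders_committed() asserts that cxl_region_rwsem is held */
	guard(rwsem_read)(&cxl_region_rwsem);
	if (port && cxl_num_decoders_committed(port))
		device_for_each_child(&port->dev, &ctx, __cxl_contains_hpa);

	return ctx.contains;
}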

> +		device_for_each_child(&port->dev, &ctx, __cxl_contains_hpa);
> +
> +	return ctx.contains;
> +}
> +
> +static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
> +			  void *data)
> +{
> +	struct mce *mce = (struct mce *)data;
> +	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
> +						    mce_notifier);
> +	u64 hpa;
> +
> +	if (!mce || !mce_usable_address(mce))
> +		return NOTIFY_DONE;
> +
> +	hpa = mce->addr & MCI_ADDR_PHYSADDR;
> +
> +	/* Check if the PFN is located on this CXL device */
> +	if (!pfn_valid(hpa >> PAGE_SHIFT) &&
> +	    !cxl_contains_hpa(mds->cxlds.cxlmd, hpa))
> +		return NOTIFY_DONE;
> +
> +	/*
> +	 * Search PFN in the cxl_mce_records, if already exists, don't continue
> +	 * to do memory_failure() to avoid a poison address being reported
> +	 * more than once.
> +	 */
> +	if (cxl_mce_recorded(hpa))
> +		return NOTIFY_STOP;
> +	else
> +		return NOTIFY_OK;
> +}
> +
>  struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
>  {
>  	struct cxl_memdev_state *mds;
> @@ -1427,6 +1553,10 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
>  	mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
>  	mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
>  
> +	mds->mce_notifier.notifier_call = cxl_handle_mce;
> +	mds->mce_notifier.priority = MCE_PRIO_CXL;
> +	mce_register_decode_chain(&mds->mce_notifier);
> +
>  	return mds;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL);
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index 0277726afd04..aa3ac89d17be 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -376,10 +376,14 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
>  		goto out;
>  
>  	cxlr = cxl_dpa_to_region(cxlmd, dpa);
> -	if (cxlr)
> +	if (cxlr) {
> +		u64 hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
> +
> +		cxl_mce_clear(hpa);
>  		dev_warn_once(mds->cxlds.dev,
>  			      "poison clear dpa:%#llx region: %s\n", dpa,
>  			      dev_name(&cxlr->dev));
> +	}
>  
>  	record = (struct cxl_poison_record) {
>  		.address = cpu_to_le64(dpa),
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 19aba81cdf13..fbf8d9f46984 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -501,6 +501,7 @@ struct cxl_memdev_state {
>  	struct cxl_fw_state fw;
>  
>  	struct rcuwait mbox_wait;
> +	struct notifier_block mce_notifier;
>  	int (*mbox_send)(struct cxl_memdev_state *mds,
>  			 struct cxl_mbox_cmd *cmd);
>  };
> @@ -836,6 +837,8 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
>  int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
>  int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
>  int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
> +bool cxl_mce_recorded(u64 pfn);
> +void cxl_mce_clear(u64 pfn);
>  
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
Shiyang Ruan June 19, 2024, 9:24 a.m. UTC | #2
在 2024/6/19 7:35, Dave Jiang 写道:
> 
> 
> On 6/18/24 9:53 AM, Shiyang Ruan wrote:
>> Background:
>> Since CXL device is a memory device, while CPU consumes a poison page of
>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>> which-First path is configured.  This is the first report.  Then
>> currently, in FW-First path, the poison event is transferred according
>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>   -> CPER -> trace report.  This is the second one.  These two reports
>> are indicating the same poisoning page, which is the so-called "duplicate
>> report"[1].  And the memory_failure() handling I'm trying to add in
>> OS-First path could also be another duplicate report.
>>
>> Hope the flow below could make it easier to understand:
>> CPU accesses bad memory on CXL device, then
>>   -> MCE (INT18), *always* report (1)
>>   -> * FW-First (implemented now)
>>        -> CXL device -> FW
>> 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>      * OS-First (not implemented yet, I'm working on it)
>>        -> CXL device -> MSI
>> 	      -> OS:CXL driver -> memory_failure() (2.b)
>> so, the (1) and (2.a/b) are duplicated.
>>
>> (I didn't get response in my reply for [1] while I have to make patch to
>> solve this problem, so please correct me if my understanding is wrong.)
>>
>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>> to check whether the current poison page has been reported (if yes,
>> stop the notifier chain, won't call the following memory_failure()
>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>> page already handled(recorded and reported) in (1) or (2), the other one
>> won't duplicate the report.  The record could be clear when
>> cxl_clear_poison() is called.
>>
>> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>

...

>> +
>> +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
>> +{
>> +	struct cxl_contains_hpa_context ctx = {
>> +		.contains = false,
>> +		.hpa = hpa,
>> +	};
>> +	struct cxl_port *port;
>> +
>> +	port = cxlmd->endpoint;
>> +	if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
> 
> Maybe no need to check is_cxl_endpoint() if the port is retrieved from cxlmd->endpoint.

OK, I'll remove this.

> 
> Also, in order to use cxl_num_decoders_committed(), cxl_region_rwsem must be held. See the lockdep_assert_held() in the function. Maybe add a
> guard(cxl_regoin_rwsem);
> before the if statement above.

Got it.  I didn't realize it before.  Will add it.


BTW, may I have your opinion on this proposal?  I'm not sure if the 
Background and problem described above are correct or not.  If not, it 
could lead me in the wrong direction.

Thank you very much!


--
Ruan.

> 
> DJ
>
Dave Jiang June 20, 2024, 3:51 p.m. UTC | #3
On 6/19/24 2:24 AM, Shiyang Ruan wrote:
> 
> 
> 在 2024/6/19 7:35, Dave Jiang 写道:
>>
>>
>> On 6/18/24 9:53 AM, Shiyang Ruan wrote:
>>> Background:
>>> Since CXL device is a memory device, while CPU consumes a poison page of
>>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>>> which-First path is configured.  This is the first report.  Then
>>> currently, in FW-First path, the poison event is transferred according
>>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>>   -> CPER -> trace report.  This is the second one.  These two reports
>>> are indicating the same poisoning page, which is the so-called "duplicate
>>> report"[1].  And the memory_failure() handling I'm trying to add in
>>> OS-First path could also be another duplicate report.
>>>
>>> Hope the flow below could make it easier to understand:
>>> CPU accesses bad memory on CXL device, then
>>>   -> MCE (INT18), *always* report (1)
>>>   -> * FW-First (implemented now)
>>>        -> CXL device -> FW
>>>           -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>>      * OS-First (not implemented yet, I'm working on it)
>>>        -> CXL device -> MSI
>>>           -> OS:CXL driver -> memory_failure() (2.b)
>>> so, the (1) and (2.a/b) are duplicated.
>>>
>>> (I didn't get response in my reply for [1] while I have to make patch to
>>> solve this problem, so please correct me if my understanding is wrong.)
>>>
>>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>>> to check whether the current poison page has been reported (if yes,
>>> stop the notifier chain, won't call the following memory_failure()
>>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>>> page already handled(recorded and reported) in (1) or (2), the other one
>>> won't duplicate the report.  The record could be clear when
>>> cxl_clear_poison() is called.
>>>
>>> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>>
> 
> ...
> 
>>> +
>>> +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
>>> +{
>>> +    struct cxl_contains_hpa_context ctx = {
>>> +        .contains = false,
>>> +        .hpa = hpa,
>>> +    };
>>> +    struct cxl_port *port;
>>> +
>>> +    port = cxlmd->endpoint;
>>> +    if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
>>
>> Maybe no need to check is_cxl_endpoint() if the port is retrieved from cxlmd->endpoint.
> 
> OK, I'll remove this.
> 
>>
>> Also, in order to use cxl_num_decoders_committed(), cxl_region_rwsem must be held. See the lockdep_assert_held() in the function. Maybe add a
>> guard(cxl_regoin_rwsem);
>> before the if statement above.
> 
> Got it.  I didn't realize it before.  Will add it.
> 
> 
> BTW, may I have your opinion on this proposal?  I'm not sure if the Background and problem described above are correct or not.  If not, it could lead me in the wrong direction.

Patch looks OK to me, but I'm no RAS expert in this area. Let's wait for comments from Jonathan and Dan.
> 
> Thank you very much!
> 
> 
> -- 
> Ruan.
> 
>>
>> DJ
>>
Jonathan Cameron June 20, 2024, 5:02 p.m. UTC | #4
On Wed, 19 Jun 2024 00:53:10 +0800
Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:

> Background:
> Since CXL device is a memory device, while CPU consumes a poison page of 
> CXL device, it always triggers a MCE by interrupt (INT18), no matter 
> which-First path is configured.  This is the first report.  Then 
> currently, in FW-First path, the poison event is transferred according 
> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES 
>  -> CPER -> trace report.  This is the second one.  These two reports  
> are indicating the same poisoning page, which is the so-called "duplicate
> report"[1].  And the memory_failure() handling I'm trying to add in
> OS-First path could also be another duplicate report.
> 
> Hope the flow below could make it easier to understand:
> CPU accesses bad memory on CXL device, then
>  -> MCE (INT18), *always* report (1)
>  -> * FW-First (implemented now)
>       -> CXL device -> FW
> 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)  
>     * OS-First (not implemented yet, I'm working on it)
>       -> CXL device -> MSI
> 	      -> OS:CXL driver -> memory_failure() (2.b)  
> so, the (1) and (2.a/b) are duplicated.
> 
> (I didn't get response in my reply for [1] while I have to make patch to
> solve this problem, so please correct me if my understanding is wrong.)
> 
> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> to check whether the current poison page has been reported (if yes,
> stop the notifier chain, won't call the following memory_failure()
> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> page already handled(recorded and reported) in (1) or (2), the other one
> won't duplicate the report.  The record could be clear when
> cxl_clear_poison() is called.
> 
> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>

So poison can be cleared in a number of ways and a CXL poison clear command
is unfortunately only one of them.  Some architectures have instructions
that guarantee to write a whole cacheline and can clear things as well.
I believe x86 does for starters.

+CC linux-edac and related maintainers / reviewers.
    linux-mm and hwpoison maintainer.

So I think this needs a more general solution that encompasses 
more general cleanup of poison.

Trivial comments inline.

Jonathan


> ---
>  arch/x86/include/asm/mce.h |   1 +
>  drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/memdev.c  |   6 +-
>  drivers/cxl/cxlmem.h       |   3 +
>  4 files changed, 139 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index dfd2e9699bd7..d8109c48e7d9 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>  	MCE_PRIO_NFIT,
>  	MCE_PRIO_EXTLOG,
>  	MCE_PRIO_UC,
> +	MCE_PRIO_CXL,
>  	MCE_PRIO_EARLY,
>  	MCE_PRIO_CEC,
>  	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 2626f3fff201..0eb3c5401e81 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -4,6 +4,8 @@
>  #include <linux/debugfs.h>
>  #include <linux/ktime.h>
>  #include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <asm/mce.h>
>  #include <asm/unaligned.h>
>  #include <cxlpci.h>
>  #include <cxlmem.h>
> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>  		if (cxlr)
>  			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>  
> +		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> +			return;
> +
>  		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>  			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>  						&evt->gen_media);
> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>  
> +struct cxl_mce_record {
> +	struct list_head node;
> +	u64 hpa;
> +};
> +LIST_HEAD(cxl_mce_records);
> +DEFINE_MUTEX(cxl_mce_mutex);
> +
> +bool cxl_mce_recorded(u64 hpa)
> +{
> +	struct cxl_mce_record *cur, *next, *rec;
> +	int rc;
> +
> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);

guard(mutex)(&cxl_mce_mutex);

> +	if (rc)
> +		return false;
> +
> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
> +		if (cur->hpa == hpa) {
> +			mutex_unlock(&cxl_mce_mutex);
> +			return true;
> +		}
> +	}
> +
> +	rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
> +	rec->hpa = hpa;
> +	list_add(&cxl_mce_records, &rec->node);
> +
> +	mutex_unlock(&cxl_mce_mutex);
> +
> +	return false;
> +}
> +
> +void cxl_mce_clear(u64 hpa)
> +{
> +	struct cxl_mce_record *cur, *next;
> +	int rc;
> +
> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);

Maybe cond_guard().

> +	if (rc)
> +		return;
> +
> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
> +		if (cur->hpa == hpa) {
> +			list_del(&cur->node);
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&cxl_mce_mutex);
> +}
> +
> +struct cxl_contains_hpa_context {
> +	bool contains;
> +	u64 hpa;
> +};
> +
> +static int __cxl_contains_hpa(struct device *dev, void *arg)
> +{
> +	struct cxl_contains_hpa_context *ctx = arg;
> +	struct cxl_endpoint_decoder *cxled;
> +	struct range *range;
> +	u64 hpa = ctx->hpa;
> +
> +	if (!is_endpoint_decoder(dev))
> +		return 0;
> +
> +	cxled = to_cxl_endpoint_decoder(dev);
> +	range = &cxled->cxld.hpa_range;
> +
> +	if (range->start <= hpa && hpa <= range->end) {
> +		ctx->contains = true;
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
> +{
> +	struct cxl_contains_hpa_context ctx = {
> +		.contains = false,
> +		.hpa = hpa,
> +	};
> +	struct cxl_port *port;
> +
> +	port = cxlmd->endpoint;
> +	if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
> +		device_for_each_child(&port->dev, &ctx, __cxl_contains_hpa);
> +
> +	return ctx.contains;
> +}
> +
> +static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
> +			  void *data)
> +{
> +	struct mce *mce = (struct mce *)data;
> +	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
> +						    mce_notifier);
> +	u64 hpa;
> +
> +	if (!mce || !mce_usable_address(mce))
> +		return NOTIFY_DONE;
> +
> +	hpa = mce->addr & MCI_ADDR_PHYSADDR;
> +
> +	/* Check if the PFN is located on this CXL device */
> +	if (!pfn_valid(hpa >> PAGE_SHIFT) &&
> +	    !cxl_contains_hpa(mds->cxlds.cxlmd, hpa))
> +		return NOTIFY_DONE;
> +
> +	/*
> +	 * Search PFN in the cxl_mce_records, if already exists, don't continue
> +	 * to do memory_failure() to avoid a poison address being reported
> +	 * more than once.
> +	 */
> +	if (cxl_mce_recorded(hpa))
> +		return NOTIFY_STOP;
> +	else
> +		return NOTIFY_OK;
> +}
> +
>  struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
>  {
>  	struct cxl_memdev_state *mds;
> @@ -1427,6 +1553,10 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
>  	mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
>  	mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
>  
> +	mds->mce_notifier.notifier_call = cxl_handle_mce;
> +	mds->mce_notifier.priority = MCE_PRIO_CXL;
> +	mce_register_decode_chain(&mds->mce_notifier);
> +
>  	return mds;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL);
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index 0277726afd04..aa3ac89d17be 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -376,10 +376,14 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
>  		goto out;
>  
>  	cxlr = cxl_dpa_to_region(cxlmd, dpa);
> -	if (cxlr)
> +	if (cxlr) {
> +		u64 hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
> +
> +		cxl_mce_clear(hpa);
>  		dev_warn_once(mds->cxlds.dev,
>  			      "poison clear dpa:%#llx region: %s\n", dpa,
>  			      dev_name(&cxlr->dev));
> +	}
>  
>  	record = (struct cxl_poison_record) {
>  		.address = cpu_to_le64(dpa),
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 19aba81cdf13..fbf8d9f46984 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -501,6 +501,7 @@ struct cxl_memdev_state {
>  	struct cxl_fw_state fw;
>  
>  	struct rcuwait mbox_wait;
> +	struct notifier_block mce_notifier;
>  	int (*mbox_send)(struct cxl_memdev_state *mds,
>  			 struct cxl_mbox_cmd *cmd);
>  };
> @@ -836,6 +837,8 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
>  int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
>  int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
>  int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
> +bool cxl_mce_recorded(u64 pfn);
> +void cxl_mce_clear(u64 pfn);
>  
>  #ifdef CONFIG_CXL_SUSPEND
>  void cxl_mem_active_inc(void);
Shiyang Ruan June 21, 2024, 10:16 a.m. UTC | #5
在 2024/6/21 1:02, Jonathan Cameron 写道:
> On Wed, 19 Jun 2024 00:53:10 +0800
> Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:
> 
>> Background:
>> Since CXL device is a memory device, while CPU consumes a poison page of
>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>> which-First path is configured.  This is the first report.  Then
>> currently, in FW-First path, the poison event is transferred according
>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>   -> CPER -> trace report.  This is the second one.  These two reports
>> are indicating the same poisoning page, which is the so-called "duplicate
>> report"[1].  And the memory_failure() handling I'm trying to add in
>> OS-First path could also be another duplicate report.
>>
>> Hope the flow below could make it easier to understand:
>> CPU accesses bad memory on CXL device, then
>>   -> MCE (INT18), *always* report (1)
>>   -> * FW-First (implemented now)
>>        -> CXL device -> FW
>> 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>      * OS-First (not implemented yet, I'm working on it)
>>        -> CXL device -> MSI
>> 	      -> OS:CXL driver -> memory_failure() (2.b)
>> so, the (1) and (2.a/b) are duplicated.
>>
>> (I didn't get response in my reply for [1] while I have to make patch to
>> solve this problem, so please correct me if my understanding is wrong.)
>>
>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>> to check whether the current poison page has been reported (if yes,
>> stop the notifier chain, won't call the following memory_failure()
>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>> page already handled(recorded and reported) in (1) or (2), the other one
>> won't duplicate the report.  The record could be clear when
>> cxl_clear_poison() is called.
>>
>> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> 
> So poison can be cleared in a number of ways and a CXL poison clear command
> is unfortunately only one of them.  Some architectures have instructions
> that guarantee to write a whole cacheline and can clear things as well.
> I believe x86 does for starters.

According to the CXL spec, to clear an error record on the device, an
explicit clear operation is required (I think this means sending a
mailbox command).  I'm not sure whether the device error can be cleared
by just writing a whole cacheline.
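
For reference, the Clear Poison mailbox command takes a DPA plus 64 bytes
of replacement data that the device writes to the cleared location; a
sketch of that input payload is below (struct and field names here are
illustrative, not taken from the driver):

struct clear_poison_in {
	__le64 dpa;		/* 64-byte-aligned device physical address */
	u8 write_data[64];	/* data the device writes to the cleared line */
} __packed;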

> 
> +CC linux-edac and related maintainers / reviewers.
>      linux-mm and hwpoison maintainer.
> 
> So I think this needs a more general solution that encompasses
> more general cleanup of poison.
> 
> Trivial comments inline.

Thanks

> 
> Jonathan
> 
> 
>> ---
>>   arch/x86/include/asm/mce.h |   1 +
>>   drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>>   drivers/cxl/core/memdev.c  |   6 +-
>>   drivers/cxl/cxlmem.h       |   3 +
>>   4 files changed, 139 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
>> index dfd2e9699bd7..d8109c48e7d9 100644
>> --- a/arch/x86/include/asm/mce.h
>> +++ b/arch/x86/include/asm/mce.h
>> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>>   	MCE_PRIO_NFIT,
>>   	MCE_PRIO_EXTLOG,
>>   	MCE_PRIO_UC,
>> +	MCE_PRIO_CXL,
>>   	MCE_PRIO_EARLY,
>>   	MCE_PRIO_CEC,
>>   	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
>> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
>> index 2626f3fff201..0eb3c5401e81 100644
>> --- a/drivers/cxl/core/mbox.c
>> +++ b/drivers/cxl/core/mbox.c
>> @@ -4,6 +4,8 @@
>>   #include <linux/debugfs.h>
>>   #include <linux/ktime.h>
>>   #include <linux/mutex.h>
>> +#include <linux/notifier.h>
>> +#include <asm/mce.h>
>>   #include <asm/unaligned.h>
>>   #include <cxlpci.h>
>>   #include <cxlmem.h>
>> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>>   		if (cxlr)
>>   			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>>   
>> +		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
>> +			return;
>> +
>>   		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>>   			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>>   						&evt->gen_media);
>> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
>>   }
>>   EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>>   
>> +struct cxl_mce_record {
>> +	struct list_head node;
>> +	u64 hpa;
>> +};
>> +LIST_HEAD(cxl_mce_records);
>> +DEFINE_MUTEX(cxl_mce_mutex);
>> +
>> +bool cxl_mce_recorded(u64 hpa)
>> +{
>> +	struct cxl_mce_record *cur, *next, *rec;
>> +	int rc;
>> +
>> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);
> 
> guard(mutex)(&cxl_mce_muted);
> 
>> +	if (rc)
>> +		return false;
>> +
>> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
>> +		if (cur->hpa == hpa) {
>> +			mutex_unlock(&cxl_mce_mutex);
>> +			return true;
>> +		}
>> +	}
>> +
>> +	rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
>> +	rec->hpa = hpa;
>> +	list_add(&cxl_mce_records, &rec->node);
>> +
>> +	mutex_unlock(&cxl_mce_mutex);
>> +
>> +	return false;
>> +}
>> +
>> +void cxl_mce_clear(u64 hpa)
>> +{
>> +	struct cxl_mce_record *cur, *next;
>> +	int rc;
>> +
>> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);
> 
> Maybe cond_guard().

OK, this is better.  I'll use the automatic cleanup lock guards instead.
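
A minimal sketch of what that might look like, keeping the RFC's
list-based bookkeeping but with the scope-based guard() (untested; note
it also checks the kmalloc() result and passes list_add() its arguments
in (new, head) order):

static bool cxl_mce_recorded(u64 hpa)
{
	struct cxl_mce_record *cur, *rec;

	guard(mutex)(&cxl_mce_mutex);

	list_for_each_entry(cur, &cxl_mce_records, node)
		if (cur->hpa == hpa)
			return true;

	rec = kmalloc(sizeof(*rec), GFP_KERNEL);
	if (!rec)
		return false;

	rec->hpa = hpa;
	list_add(&rec->node, &cxl_mce_records);

	return false;
}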


--
Thanks,
Ruan.

> 
>> +	if (rc)
>> +		return;
>> +
>> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
>> +		if (cur->hpa == hpa) {
>> +			list_del(&cur->node);
>> +			break;
>> +		}
>> +	}
>> +
>> +	mutex_unlock(&cxl_mce_mutex);
>> +}
>> +
>> +struct cxl_contains_hpa_context {
>> +	bool contains;
>> +	u64 hpa;
>> +};
>> +
>> +static int __cxl_contains_hpa(struct device *dev, void *arg)
>> +{
>> +	struct cxl_contains_hpa_context *ctx = arg;
>> +	struct cxl_endpoint_decoder *cxled;
>> +	struct range *range;
>> +	u64 hpa = ctx->hpa;
>> +
>> +	if (!is_endpoint_decoder(dev))
>> +		return 0;
>> +
>> +	cxled = to_cxl_endpoint_decoder(dev);
>> +	range = &cxled->cxld.hpa_range;
>> +
>> +	if (range->start <= hpa && hpa <= range->end) {
>> +		ctx->contains = true;
>> +		return 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
>> +{
>> +	struct cxl_contains_hpa_context ctx = {
>> +		.contains = false,
>> +		.hpa = hpa,
>> +	};
>> +	struct cxl_port *port;
>> +
>> +	port = cxlmd->endpoint;
>> +	if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
>> +		device_for_each_child(&port->dev, &ctx, __cxl_contains_hpa);
>> +
>> +	return ctx.contains;
>> +}
>> +
>> +static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
>> +			  void *data)
>> +{
>> +	struct mce *mce = (struct mce *)data;
>> +	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
>> +						    mce_notifier);
>> +	u64 hpa;
>> +
>> +	if (!mce || !mce_usable_address(mce))
>> +		return NOTIFY_DONE;
>> +
>> +	hpa = mce->addr & MCI_ADDR_PHYSADDR;
>> +
>> +	/* Check if the PFN is located on this CXL device */
>> +	if (!pfn_valid(hpa >> PAGE_SHIFT) &&
>> +	    !cxl_contains_hpa(mds->cxlds.cxlmd, hpa))
>> +		return NOTIFY_DONE;
>> +
>> +	/*
>> +	 * Search PFN in the cxl_mce_records, if already exists, don't continue
>> +	 * to do memory_failure() to avoid a poison address being reported
>> +	 * more than once.
>> +	 */
>> +	if (cxl_mce_recorded(hpa))
>> +		return NOTIFY_STOP;
>> +	else
>> +		return NOTIFY_OK;
>> +}
>> +
>>   struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
>>   {
>>   	struct cxl_memdev_state *mds;
>> @@ -1427,6 +1553,10 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
>>   	mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
>>   	mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
>>   
>> +	mds->mce_notifier.notifier_call = cxl_handle_mce;
>> +	mds->mce_notifier.priority = MCE_PRIO_CXL;
>> +	mce_register_decode_chain(&mds->mce_notifier);
>> +
>>   	return mds;
>>   }
>>   EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL);
>> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
>> index 0277726afd04..aa3ac89d17be 100644
>> --- a/drivers/cxl/core/memdev.c
>> +++ b/drivers/cxl/core/memdev.c
>> @@ -376,10 +376,14 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
>>   		goto out;
>>   
>>   	cxlr = cxl_dpa_to_region(cxlmd, dpa);
>> -	if (cxlr)
>> +	if (cxlr) {
>> +		u64 hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>> +
>> +		cxl_mce_clear(hpa);
>>   		dev_warn_once(mds->cxlds.dev,
>>   			      "poison clear dpa:%#llx region: %s\n", dpa,
>>   			      dev_name(&cxlr->dev));
>> +	}
>>   
>>   	record = (struct cxl_poison_record) {
>>   		.address = cpu_to_le64(dpa),
>> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>> index 19aba81cdf13..fbf8d9f46984 100644
>> --- a/drivers/cxl/cxlmem.h
>> +++ b/drivers/cxl/cxlmem.h
>> @@ -501,6 +501,7 @@ struct cxl_memdev_state {
>>   	struct cxl_fw_state fw;
>>   
>>   	struct rcuwait mbox_wait;
>> +	struct notifier_block mce_notifier;
>>   	int (*mbox_send)(struct cxl_memdev_state *mds,
>>   			 struct cxl_mbox_cmd *cmd);
>>   };
>> @@ -836,6 +837,8 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
>>   int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
>>   int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
>>   int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
>> +bool cxl_mce_recorded(u64 pfn);
>> +void cxl_mce_clear(u64 pfn);
>>   
>>   #ifdef CONFIG_CXL_SUSPEND
>>   void cxl_mem_active_inc(void);
>
Shiyang Ruan June 21, 2024, 10:18 a.m. UTC | #6
在 2024/6/20 23:51, Dave Jiang 写道:
> 
> 
> On 6/19/24 2:24 AM, Shiyang Ruan wrote:
>>
>>
>> 在 2024/6/19 7:35, Dave Jiang 写道:
>>>
>>>
>>> On 6/18/24 9:53 AM, Shiyang Ruan wrote:
>>>> Background:
>>>> Since CXL device is a memory device, while CPU consumes a poison page of
>>>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>>>> which-First path is configured.  This is the first report.  Then
>>>> currently, in FW-First path, the poison event is transferred according
>>>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>>>    -> CPER -> trace report.  This is the second one.  These two reports
>>>> are indicating the same poisoning page, which is the so-called "duplicate
>>>> report"[1].  And the memory_failure() handling I'm trying to add in
>>>> OS-First path could also be another duplicate report.
>>>>
>>>> Hope the flow below could make it easier to understand:
>>>> CPU accesses bad memory on CXL device, then
>>>>    -> MCE (INT18), *always* report (1)
>>>>    -> * FW-First (implemented now)
>>>>         -> CXL device -> FW
>>>>            -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>>>       * OS-First (not implemented yet, I'm working on it)
>>>>         -> CXL device -> MSI
>>>>            -> OS:CXL driver -> memory_failure() (2.b)
>>>> so, the (1) and (2.a/b) are duplicated.
>>>>
>>>> (I didn't get response in my reply for [1] while I have to make patch to
>>>> solve this problem, so please correct me if my understanding is wrong.)
>>>>
>>>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>>>> to check whether the current poison page has been reported (if yes,
>>>> stop the notifier chain, won't call the following memory_failure()
>>>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>>>> page already handled(recorded and reported) in (1) or (2), the other one
>>>> won't duplicate the report.  The record could be clear when
>>>> cxl_clear_poison() is called.
>>>>
>>>> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>>>
>>
>> ...
>>
>>>> +
>>>> +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
>>>> +{
>>>> +    struct cxl_contains_hpa_context ctx = {
>>>> +        .contains = false,
>>>> +        .hpa = hpa,
>>>> +    };
>>>> +    struct cxl_port *port;
>>>> +
>>>> +    port = cxlmd->endpoint;
>>>> +    if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
>>>
>>> Maybe no need to check is_cxl_endpoint() if the port is retrieved from cxlmd->endpoint.
>>
>> OK, I'll remove this.
>>
>>>
>>> Also, in order to use cxl_num_decoders_committed(), cxl_region_rwsem must be held. See the lockdep_assert_held() in the function. Maybe add a
>>> guard(cxl_regoin_rwsem);
>>> before the if statement above.
>>
>> Got it.  I didn't realize it before.  Will add it.
>>
>>
>> BTW, may I have your opinion on this proposal?  I'm not sure if the Background and problem described above are correct or not.  If not, it could lead me in the wrong direction.
> 
> Patch looks ok to me, but I'm no RAS expert in this area. Lets wait for comments from Jonathan and Dan.

Thanks!

--
Ruan.

>>
>> Thank you very much!
>>
>>
>> -- 
>> Ruan.
>>
>>>
>>> DJ
>>>
Jonathan Cameron June 21, 2024, 5:21 p.m. UTC | #7
On Fri, 21 Jun 2024 18:16:33 +0800
Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:

> 在 2024/6/21 1:02, Jonathan Cameron 写道:
> > On Wed, 19 Jun 2024 00:53:10 +0800
> > Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:
> >   
> >> Background:
> >> Since CXL device is a memory device, while CPU consumes a poison page of
> >> CXL device, it always triggers a MCE by interrupt (INT18), no matter
> >> which-First path is configured.  This is the first report.  Then
> >> currently, in FW-First path, the poison event is transferred according
> >> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES  
> >>   -> CPER -> trace report.  This is the second one.  These two reports  
> >> are indicating the same poisoning page, which is the so-called "duplicate
> >> report"[1].  And the memory_failure() handling I'm trying to add in
> >> OS-First path could also be another duplicate report.
> >>
> >> Hope the flow below could make it easier to understand:
> >> CPU accesses bad memory on CXL device, then  
> >>   -> MCE (INT18), *always* report (1)
> >>   -> * FW-First (implemented now)
> >>        -> CXL device -> FW
> >> 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)  
> >>      * OS-First (not implemented yet, I'm working on it)  
> >>        -> CXL device -> MSI
> >> 	      -> OS:CXL driver -> memory_failure() (2.b)  
> >> so, the (1) and (2.a/b) are duplicated.
> >>
> >> (I didn't get response in my reply for [1] while I have to make patch to
> >> solve this problem, so please correct me if my understanding is wrong.)
> >>
> >> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> >> to check whether the current poison page has been reported (if yes,
> >> stop the notifier chain, won't call the following memory_failure()
> >> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> >> page already handled(recorded and reported) in (1) or (2), the other one
> >> won't duplicate the report.  The record could be clear when
> >> cxl_clear_poison() is called.
> >>
> >> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> >>
> >> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>  
> > 
> > So poison can be cleared in a number of ways and a CXL poison clear command
> > is unfortunately only one of them.  Some architectures have instructions
> > that guarantee to write a whole cacheline and can clear things as well.
> > I believe x86 does for starters.  
> 
> According to the CXL Spec, to clear an error record on device, an 
> explicit clear operation is required (I think this means sending a mbox 
> command).  I'm not sure if it is able to clear device error by just 
> writing a whole cacheline.
> 

Please give a spec reference.  The only one I'm immediately seeing is
in 8.3.9.9.4.1 Get Poison List (opcode 4300h)
which says
"When poison is cleared"
but doesn't talk about how.

For TSP cases, Clear Poison is not allowed, so if they want to clear it
they will have to do it via a suitable CPU-arch approach, not that command
(which may not be implemented in a given device - I gather it is
awkward to do, and a backdoor from the control path to the datapath isn't
a popular feature!).

+CC John Groves.  John, any info you can share on whether you expect all
devices with a poison list to support the clear poison command?



> > 
> > +CC linux-edac and related maintainers / reviewers.
> >      linux-mm and hwpoison maintainer.
> > 
> > So I think this needs a more general solution that encompasses
> > more general cleanup of poison.
> > 
> > Trivial comments inline.  
> 
> Thanks
> 
> > 
> > Jonathan
> > 
> >   
> >> ---
> >>   arch/x86/include/asm/mce.h |   1 +
> >>   drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
> >>   drivers/cxl/core/memdev.c  |   6 +-
> >>   drivers/cxl/cxlmem.h       |   3 +
> >>   4 files changed, 139 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> >> index dfd2e9699bd7..d8109c48e7d9 100644
> >> --- a/arch/x86/include/asm/mce.h
> >> +++ b/arch/x86/include/asm/mce.h
> >> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
> >>   	MCE_PRIO_NFIT,
> >>   	MCE_PRIO_EXTLOG,
> >>   	MCE_PRIO_UC,
> >> +	MCE_PRIO_CXL,
> >>   	MCE_PRIO_EARLY,
> >>   	MCE_PRIO_CEC,
> >>   	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> >> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> >> index 2626f3fff201..0eb3c5401e81 100644
> >> --- a/drivers/cxl/core/mbox.c
> >> +++ b/drivers/cxl/core/mbox.c
> >> @@ -4,6 +4,8 @@
> >>   #include <linux/debugfs.h>
> >>   #include <linux/ktime.h>
> >>   #include <linux/mutex.h>
> >> +#include <linux/notifier.h>
> >> +#include <asm/mce.h>
> >>   #include <asm/unaligned.h>
> >>   #include <cxlpci.h>
> >>   #include <cxlmem.h>
> >> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> >>   		if (cxlr)
> >>   			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
> >>   
> >> +		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> >> +			return;
> >> +
> >>   		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
> >>   			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
> >>   						&evt->gen_media);
> >> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
> >>   }
> >>   EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
> >>   
> >> +struct cxl_mce_record {
> >> +	struct list_head node;
> >> +	u64 hpa;
> >> +};
> >> +LIST_HEAD(cxl_mce_records);
> >> +DEFINE_MUTEX(cxl_mce_mutex);
> >> +
> >> +bool cxl_mce_recorded(u64 hpa)
> >> +{
> >> +	struct cxl_mce_record *cur, *next, *rec;
> >> +	int rc;
> >> +
> >> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);  
> > 
> > guard(mutex)(&cxl_mce_muted);
> >   
> >> +	if (rc)
> >> +		return false;
> >> +
> >> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
> >> +		if (cur->hpa == hpa) {
> >> +			mutex_unlock(&cxl_mce_mutex);
> >> +			return true;
> >> +		}
> >> +	}
> >> +
> >> +	rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
> >> +	rec->hpa = hpa;
> >> +	list_add(&cxl_mce_records, &rec->node);
> >> +
> >> +	mutex_unlock(&cxl_mce_mutex);
> >> +
> >> +	return false;
> >> +}
> >> +
> >> +void cxl_mce_clear(u64 hpa)
> >> +{
> >> +	struct cxl_mce_record *cur, *next;
> >> +	int rc;
> >> +
> >> +	rc = mutex_lock_interruptible(&cxl_mce_mutex);  
> > 
> > Maybe cond_guard().  
> 
> Ok, this is better.  I'll use automatic clean locks instead.
> 
> 
> --
> Thanks,
> Ruan.
> 
> >   
> >> +	if (rc)
> >> +		return;
> >> +
> >> +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
> >> +		if (cur->hpa == hpa) {
> >> +			list_del(&cur->node);
> >> +			break;
> >> +		}
> >> +	}
> >> +
> >> +	mutex_unlock(&cxl_mce_mutex);
> >> +}
> >> +
> >> +struct cxl_contains_hpa_context {
> >> +	bool contains;
> >> +	u64 hpa;
> >> +};
> >> +
> >> +static int __cxl_contains_hpa(struct device *dev, void *arg)
> >> +{
> >> +	struct cxl_contains_hpa_context *ctx = arg;
> >> +	struct cxl_endpoint_decoder *cxled;
> >> +	struct range *range;
> >> +	u64 hpa = ctx->hpa;
> >> +
> >> +	if (!is_endpoint_decoder(dev))
> >> +		return 0;
> >> +
> >> +	cxled = to_cxl_endpoint_decoder(dev);
> >> +	range = &cxled->cxld.hpa_range;
> >> +
> >> +	if (range->start <= hpa && hpa <= range->end) {
> >> +		ctx->contains = true;
> >> +		return 1;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
> >> +{
> >> +	struct cxl_contains_hpa_context ctx = {
> >> +		.contains = false,
> >> +		.hpa = hpa,
> >> +	};
> >> +	struct cxl_port *port;
> >> +
> >> +	port = cxlmd->endpoint;
> >> +	if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
> >> +		device_for_each_child(&port->dev, &ctx, __cxl_contains_hpa);
> >> +
> >> +	return ctx.contains;
> >> +}
> >> +
> >> +static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
> >> +			  void *data)
> >> +{
> >> +	struct mce *mce = (struct mce *)data;
> >> +	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
> >> +						    mce_notifier);
> >> +	u64 hpa;
> >> +
> >> +	if (!mce || !mce_usable_address(mce))
> >> +		return NOTIFY_DONE;
> >> +
> >> +	hpa = mce->addr & MCI_ADDR_PHYSADDR;
> >> +
> >> +	/* Check if the PFN is located on this CXL device */
> >> +	if (!pfn_valid(hpa >> PAGE_SHIFT) &&
> >> +	    !cxl_contains_hpa(mds->cxlds.cxlmd, hpa))
> >> +		return NOTIFY_DONE;
> >> +
> >> +	/*
> >> +	 * Search PFN in the cxl_mce_records, if already exists, don't continue
> >> +	 * to do memory_failure() to avoid a poison address being reported
> >> +	 * more than once.
> >> +	 */
> >> +	if (cxl_mce_recorded(hpa))
> >> +		return NOTIFY_STOP;
> >> +	else
> >> +		return NOTIFY_OK;
> >> +}
> >> +
> >>   struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
> >>   {
> >>   	struct cxl_memdev_state *mds;
> >> @@ -1427,6 +1553,10 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
> >>   	mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
> >>   	mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
> >>   
> >> +	mds->mce_notifier.notifier_call = cxl_handle_mce;
> >> +	mds->mce_notifier.priority = MCE_PRIO_CXL;
> >> +	mce_register_decode_chain(&mds->mce_notifier);
> >> +
> >>   	return mds;
> >>   }
> >>   EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL);
> >> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> >> index 0277726afd04..aa3ac89d17be 100644
> >> --- a/drivers/cxl/core/memdev.c
> >> +++ b/drivers/cxl/core/memdev.c
> >> @@ -376,10 +376,14 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
> >>   		goto out;
> >>   
> >>   	cxlr = cxl_dpa_to_region(cxlmd, dpa);
> >> -	if (cxlr)
> >> +	if (cxlr) {
> >> +		u64 hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
> >> +
> >> +		cxl_mce_clear(hpa);
> >>   		dev_warn_once(mds->cxlds.dev,
> >>   			      "poison clear dpa:%#llx region: %s\n", dpa,
> >>   			      dev_name(&cxlr->dev));
> >> +	}
> >>   
> >>   	record = (struct cxl_poison_record) {
> >>   		.address = cpu_to_le64(dpa),
> >> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> >> index 19aba81cdf13..fbf8d9f46984 100644
> >> --- a/drivers/cxl/cxlmem.h
> >> +++ b/drivers/cxl/cxlmem.h
> >> @@ -501,6 +501,7 @@ struct cxl_memdev_state {
> >>   	struct cxl_fw_state fw;
> >>   
> >>   	struct rcuwait mbox_wait;
> >> +	struct notifier_block mce_notifier;
> >>   	int (*mbox_send)(struct cxl_memdev_state *mds,
> >>   			 struct cxl_mbox_cmd *cmd);
> >>   };
> >> @@ -836,6 +837,8 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> >>   int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
> >>   int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
> >>   int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
> >> +bool cxl_mce_recorded(u64 pfn);
> >> +void cxl_mce_clear(u64 pfn);
> >>   
> >>   #ifdef CONFIG_CXL_SUSPEND
> >>   void cxl_mem_active_inc(void);  
> >
Dan Williams June 21, 2024, 5:51 p.m. UTC | #8
Shiyang Ruan wrote:
> Background:
> Since CXL device is a memory device, while CPU consumes a poison page of 
> CXL device, it always triggers a MCE by interrupt (INT18), no matter 
> which-First path is configured.  This is the first report.  Then 
> currently, in FW-First path, the poison event is transferred according 
> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES 
>  -> CPER -> trace report.  This is the second one.  These two reports
> are indicating the same poisoning page, which is the so-called "duplicate
> report"[1].  And the memory_failure() handling I'm trying to add in
> OS-First path could also be another duplicate report.
> 
> Hope the flow below could make it easier to understand:
> CPU accesses bad memory on CXL device, then
>  -> MCE (INT18), *always* report (1)
>  -> * FW-First (implemented now)
>       -> CXL device -> FW
> 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>     * OS-First (not implemented yet, I'm working on it)
>       -> CXL device -> MSI
> 	      -> OS:CXL driver -> memory_failure() (2.b)
> so, the (1) and (2.a/b) are duplicated.
> 
> (I didn't get response in my reply for [1] while I have to make patch to
> solve this problem, so please correct me if my understanding is wrong.)

The CPU MCE may not be in the loop. Consider the case of patrol scrub,
or device-DMA accessing poison. In that case the device will signal a
component event and the CPU may never issue the MCE.

What is missing for me from this description is *why* the duplicate
report matters in practice. If all that happens is that the kernel
repeats the lookup to offline the page and set the HWPoison bit, is that
duplicated work worth adding more tracking?

> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> to check whether the current poison page has been reported (if yes,
> stop the notifier chain, won't call the following memory_failure()
> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> page already handled(recorded and reported) in (1) or (2), the other one
> won't duplicate the report.  The record could be clear when
> cxl_clear_poison() is called.
> 
> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  arch/x86/include/asm/mce.h |   1 +
>  drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>  drivers/cxl/core/memdev.c  |   6 +-
>  drivers/cxl/cxlmem.h       |   3 +
>  4 files changed, 139 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index dfd2e9699bd7..d8109c48e7d9 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>  	MCE_PRIO_NFIT,
>  	MCE_PRIO_EXTLOG,
>  	MCE_PRIO_UC,
> +	MCE_PRIO_CXL,
>  	MCE_PRIO_EARLY,
>  	MCE_PRIO_CEC,
>  	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 2626f3fff201..0eb3c5401e81 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -4,6 +4,8 @@
>  #include <linux/debugfs.h>
>  #include <linux/ktime.h>
>  #include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <asm/mce.h>
>  #include <asm/unaligned.h>
>  #include <cxlpci.h>
>  #include <cxlmem.h>
> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>  		if (cxlr)
>  			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>  
> +		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> +			return;
> +
>  		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>  			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>  						&evt->gen_media);
> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>  
> +struct cxl_mce_record {
> +	struct list_head node;
> +	u64 hpa;
> +};
> +LIST_HEAD(cxl_mce_records);
> +DEFINE_MUTEX(cxl_mce_mutex);

I would recommend an xarray for this use case as that already has its
own internal locking and efficient memory allocation for new nodes.

However, the "why" question needs to be answered first.
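
As a rough illustration of that suggestion, an untested sketch keyed by
PFN (the xarray name is made up, and page-granularity dedup is an
assumption):

#include <linux/xarray.h>

static DEFINE_XARRAY(cxl_poison_pfns);

/* Returns true if this HPA was already recorded, false if newly recorded */
static bool cxl_mce_recorded(u64 hpa)
{
	unsigned long pfn = hpa >> PAGE_SHIFT;

	/* xa_insert() returns -EBUSY when an entry already exists */
	return xa_insert(&cxl_poison_pfns, pfn, xa_mk_value(pfn),
			 GFP_KERNEL) == -EBUSY;
}

static void cxl_mce_clear(u64 hpa)
{
	xa_erase(&cxl_poison_pfns, hpa >> PAGE_SHIFT);
}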
Dan Williams June 21, 2024, 5:59 p.m. UTC | #9
Jonathan Cameron wrote:
> On Wed, 19 Jun 2024 00:53:10 +0800
> Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:
> 
> > Background:
> > Since CXL device is a memory device, while CPU consumes a poison page of 
> > CXL device, it always triggers a MCE by interrupt (INT18), no matter 
> > which-First path is configured.  This is the first report.  Then 
> > currently, in FW-First path, the poison event is transferred according 
> > to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES 
> >  -> CPER -> trace report.  This is the second one.  These two reports  
> > are indicating the same poisoning page, which is the so-called "duplicate
> > report"[1].  And the memory_failure() handling I'm trying to add in
> > OS-First path could also be another duplicate report.
> > 
> > Hope the flow below could make it easier to understand:
> > CPU accesses bad memory on CXL device, then
> >  -> MCE (INT18), *always* report (1)
> >  -> * FW-First (implemented now)
> >       -> CXL device -> FW
> > 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)  
> >     * OS-First (not implemented yet, I'm working on it)
> >       -> CXL device -> MSI
> > 	      -> OS:CXL driver -> memory_failure() (2.b)  
> > so, the (1) and (2.a/b) are duplicated.
> > 
> > (I didn't get response in my reply for [1] while I have to make patch to
> > solve this problem, so please correct me if my understanding is wrong.)
> > 
> > This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> > to check whether the current poison page has been reported (if yes,
> > stop the notifier chain, won't call the following memory_failure()
> > to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> > page already handled(recorded and reported) in (1) or (2), the other one
> > won't duplicate the report.  The record could be clear when
> > cxl_clear_poison() is called.
> > 
> > [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> > 
> > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> 
> So poison can be cleared in a number of ways and a CXL poison clear command
> is unfortunately only one of them.  Some architectures have instructions
> that guarantee to write a whole cacheline and can clear things as well.
> I believe x86 does for starters.

Yes, movdir64b.
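
As a hedged illustration of that kind of full-cacheline overwrite,
assuming the x86 movdir64b() helper from <asm/special_insns.h>; whether
a given platform/device actually clears poison on such a write is an
assumption, not something this thread establishes:

#include <asm/special_insns.h>	/* movdir64b(), x86-only */

/* Overwrite one poisoned, 64-byte-aligned cacheline with a single full-line store */
static void overwrite_cacheline(void *line)
{
	static const u8 zeroes[64] __aligned(64);

	movdir64b((void __iomem *)line, zeroes);
}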

> +CC linux-edac and related maintainers / reviewers.
>     linux-mm and hwpoison maintainer.
> 
> So I think this needs a more general solution that encompasses 
> more general cleanup of poison.

I think unless the device has "List Poison" coverage for volatile ranges,
the kernel should not worry about tracking this itself.

Perhaps what is needed is that, after successful memory_failure()
handling when the page is known to be offline, the device backing the
memory can be notified that it is safe to repair the page and put it
back into service, but I expect that would be a comparison of the device's
own poison tracking against the notification of a successful page
offline.

> 
> Trivial comments inline.
> 
> Jonathan
> 
> 
> > ---
> >  arch/x86/include/asm/mce.h |   1 +
> >  drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
> >  drivers/cxl/core/memdev.c  |   6 +-
> >  drivers/cxl/cxlmem.h       |   3 +
> >  4 files changed, 139 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> > index dfd2e9699bd7..d8109c48e7d9 100644
> > --- a/arch/x86/include/asm/mce.h
> > +++ b/arch/x86/include/asm/mce.h
> > @@ -182,6 +182,7 @@ enum mce_notifier_prios {
> >  	MCE_PRIO_NFIT,
> >  	MCE_PRIO_EXTLOG,
> >  	MCE_PRIO_UC,
> > +	MCE_PRIO_CXL,
> >  	MCE_PRIO_EARLY,
> >  	MCE_PRIO_CEC,
> >  	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 2626f3fff201..0eb3c5401e81 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -4,6 +4,8 @@
> >  #include <linux/debugfs.h>
> >  #include <linux/ktime.h>
> >  #include <linux/mutex.h>
> > +#include <linux/notifier.h>
> > +#include <asm/mce.h>
> >  #include <asm/unaligned.h>
> >  #include <cxlpci.h>
> >  #include <cxlmem.h>
> > @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> >  		if (cxlr)
> >  			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
> >  
> > +		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> > +			return;
> > +
> >  		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
> >  			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
> >  						&evt->gen_media);
> > @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
> >  
> > +struct cxl_mce_record {
> > +	struct list_head node;
> > +	u64 hpa;
> > +};
> > +LIST_HEAD(cxl_mce_records);
> > +DEFINE_MUTEX(cxl_mce_mutex);
> > +
> > +bool cxl_mce_recorded(u64 hpa)
> > +{
> > +	struct cxl_mce_record *cur, *next, *rec;
> > +	int rc;
> > +
> > +	rc = mutex_lock_interruptible(&cxl_mce_mutex);
> 
> guard(mutex)(&cxl_mce_muted);

Agree, _interruptible is really only suitable for user ABI facing locks,
not kernel internal helper functions, but this comment is moot if this
tracking switches to xarray.

> 
> > +	if (rc)
> > +		return false;
> > +
> > +	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
> > +		if (cur->hpa == hpa) {
> > +			mutex_unlock(&cxl_mce_mutex);
> > +			return true;
> > +		}
> > +	}
> > +
> > +	rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
> > +	rec->hpa = hpa;
> > +	list_add(&cxl_mce_records, &rec->node);
> > +
> > +	mutex_unlock(&cxl_mce_mutex);
> > +
> > +	return false;
> > +}
> > +
> > +void cxl_mce_clear(u64 hpa)
> > +{
> > +	struct cxl_mce_record *cur, *next;
> > +	int rc;
> > +
> > +	rc = mutex_lock_interruptible(&cxl_mce_mutex);
> 
> Maybe cond_guard().

cond_guard() was rejected; you meant scoped_cond_guard()? But then I
think _interruptible is not appropriate here.
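For reference, a minimal sketch of a guard()-based cxl_mce_recorded() (the
record/list names follow the patch; the kmalloc() NULL check and the
list_add() argument order are also fixed here, purely as an illustration):

```
#include <linux/list.h>
#include <linux/mutex.h>	/* provides guard(mutex) via cleanup.h */
#include <linux/slab.h>

static bool cxl_mce_recorded(u64 hpa)
{
	struct cxl_mce_record *cur, *rec;

	guard(mutex)(&cxl_mce_mutex);	/* dropped automatically on every return */

	list_for_each_entry(cur, &cxl_mce_records, node) {
		if (cur->hpa == hpa)
			return true;
	}

	rec = kmalloc(sizeof(*rec), GFP_KERNEL);
	if (!rec)
		return false;

	rec->hpa = hpa;
	list_add(&rec->node, &cxl_mce_records);
	return false;
}
```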
Jonathan Cameron June 21, 2024, 6:45 p.m. UTC | #10
On Fri, 21 Jun 2024 10:59:46 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Jonathan Cameron wrote:
> > On Wed, 19 Jun 2024 00:53:10 +0800
> > Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:
> >   
> > > Background:
> > > Since CXL device is a memory device, while CPU consumes a poison page of 
> > > CXL device, it always triggers a MCE by interrupt (INT18), no matter 
> > > which-First path is configured.  This is the first report.  Then 
> > > currently, in FW-First path, the poison event is transferred according 
> > > to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES   
> > >  -> CPER -> trace report.  This is the second one.  These two reports    
> > > are indicating the same poisoning page, which is the so-called "duplicate
> > > report"[1].  And the memory_failure() handling I'm trying to add in
> > > OS-First path could also be another duplicate report.
> > > 
> > > Hope the flow below could make it easier to understand:
> > > CPU accesses bad memory on CXL device, then  
> > >  -> MCE (INT18), *always* report (1)
> > >  -> * FW-First (implemented now)
> > >       -> CXL device -> FW
> > > 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)    
> > >     * OS-First (not implemented yet, I'm working on it)  
> > >       -> CXL device -> MSI
> > > 	      -> OS:CXL driver -> memory_failure() (2.b)    
> > > so, the (1) and (2.a/b) are duplicated.
> > > 
> > > (I didn't get response in my reply for [1] while I have to make patch to
> > > solve this problem, so please correct me if my understanding is wrong.)
> > > 
> > > This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> > > to check whether the current poison page has been reported (if yes,
> > > stop the notifier chain, won't call the following memory_failure()
> > > to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> > > page already handled(recorded and reported) in (1) or (2), the other one
> > > won't duplicate the report.  The record could be clear when
> > > cxl_clear_poison() is called.
> > > 
> > > [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> > > 
> > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>  
> > 
> > So poison can be cleared in a number of ways and a CXL poison clear command
> > is unfortunately only one of them.  Some architectures have instructions
> > that guarantee to write a whole cacheline and can clear things as well.
> > I believe x86 does for starters.  
> 
> Yes, movdir64b.

The equivalent arm64 instruction is not valid for normal memory. Let's say
no more on that :(

So who actually cares about recovering poisoned volatile memory?
I'd like to understand more about how significant a use case this is.
Whilst I can conjecture that it's an extreme case of wanting to avoid
losing the ability to create 1GiB or larger pages due to poison,
is that a real problem for anyone today?  Note this is just the case
where you've reached an actual uncorrectable error and probably
/ possibly killed something, not the more common soft offlining
of memory due to correctable errors being detected.

> 
> > +CC linux-edac and related maintainers / reviewers.
> >     linux-mm and hwpoison maintainer.
> > 
> > So I think this needs a more general solution that encompasses 
> > more general cleanup of poison.  
> 
> I think unless the device has "List Poison" coverage for volatile ranges
> that the kernel should not worry about tracking this itself.

Maybe.  I think you can still get a media event for this as well
as synchronous poison so there may be a path to a double report, just
a more timely one hopefully.

> 
> Perhaps what is needed is that after successful memory_failure()
> handling when the page is known to be offline the device backing the
> memory can be notified that it is safe to repair the page and but it
> back into service, but I expect that would be comparison of the device's
> own poison tracking relative to the notification of successful page
> offline.

That would work. Elide the error handling if the page is known to
be offline due to poison.  Might be racy though, but does it
really hurt if we occasionally report twice?
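Roughly, that elision could be just the check below (the helper name is made
up; the window between the check and the report is the race mentioned above):

```
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/pfn.h>

/* Skip the device-side report when the page is already offline for poison. */
static bool cxl_page_already_poisoned(u64 hpa)
{
	unsigned long pfn = PHYS_PFN(hpa);

	return pfn_valid(pfn) && PageHWPoison(pfn_to_page(pfn));
}
```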

> > > +	rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
> > > +	rec->hpa = hpa;
> > > +	list_add(&cxl_mce_records, &rec->node);
> > > +
> > > +	mutex_unlock(&cxl_mce_mutex);
> > > +
> > > +	return false;
> > > +}
> > > +
> > > +void cxl_mce_clear(u64 hpa)
> > > +{
> > > +	struct cxl_mce_record *cur, *next;
> > > +	int rc;
> > > +
> > > +	rc = mutex_lock_interruptible(&cxl_mce_mutex);  
> > 
> > Maybe cond_guard().  
> 
> cond_guard() was rejected, you meant scoped_cond_guard()? But, then I
> think _interruptible is not appropriate here.

Ah yes.  Indeed scoped_cond_guard() but fair enough on the
interruptible point!


>
Luck, Tony June 21, 2024, 8:44 p.m. UTC | #11
> So who actually cares about recovering poisoned volatile memory?
> I'd like to understand more on how significant a use case this is.
> Whilst I can conjecture that its an extreme case of wanting to avoid
> loosing the ability to create 1GiB or larger pages due to poison
> is that a real problem for anyone today?  Note this is just the case
> where you've reached an actual uncorrectable error and probably
> / possibly killed something, not the more common soft offlining
> of memory due to correctable errors being detected.

I guess you really need a reply from someone with a data center
with thousands of machines, since that's where this question
may be important.

My humble opinion is that, outside of the huge page issue, nobody
should try to recover a poisoned page. Systems that can report
and recover from poison have tens, hundreds, or more GBytes
of memory. Dropping 4K pages will not have any measurable
impact on a system (even if there are hundreds of pages dropped).

There's no reliable way to determine whether the poisoned page
was due to some transient issue, or a permanent defect. Recovering
a poisoned page runs the risk that the poison will re-occur. Perhaps
next use of the page will be in some unrecoverable (kernel) context.

So recovery has some risk, but very little upside benefit.

-Tony
Shiyang Ruan June 25, 2024, 1:56 p.m. UTC | #12
在 2024/6/22 1:51, Dan Williams 写道:
> Shiyang Ruan wrote:
>> Background:
>> Since CXL device is a memory device, while CPU consumes a poison page of
>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>> which-First path is configured.  This is the first report.  Then
>> currently, in FW-First path, the poison event is transferred according
>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>   -> CPER -> trace report.  This is the second one.  These two reports
>> are indicating the same poisoning page, which is the so-called "duplicate
>> report"[1].  And the memory_failure() handling I'm trying to add in
>> OS-First path could also be another duplicate report.
>>
>> Hope the flow below could make it easier to understand:
>> CPU accesses bad memory on CXL device, then
>>   -> MCE (INT18), *always* report (1)
>>   -> * FW-First (implemented now)
>>        -> CXL device -> FW
>> 	      -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>      * OS-First (not implemented yet, I'm working on it)
>>        -> CXL device -> MSI
>> 	      -> OS:CXL driver -> memory_failure() (2.b)
>> so, the (1) and (2.a/b) are duplicated.
>>
>> (I didn't get response in my reply for [1] while I have to make patch to
>> solve this problem, so please correct me if my understanding is wrong.)
> 
> The CPU MCE may not be in the loop. Consider the case of patrol scrub,
> or device-DMA accessing poison. In that case the device will signal a
> component event and the CPU may never issue the MCE.

My other patch, "cxl/core: add poison creation event handler", adds a 
call to memory_failure() when an event is received from the device, which 
checks the poison record (inserting it if it does not exist) to make sure 
the poison is reported/handled, but not twice, no matter whether the CPU 
issues the MCE later or earlier.  And the lock on the poison record makes 
sure this is fine even in a race condition.

> 
> What is missing for me from this description is *why* does the duplicate
> report matter in practice? If all that happens is that the kernel
> repeats the lookup to offline the page and set the HWPoison bit, is that
> duplicated work worth adding more tracking?

Besides setting the HWPoison bit for the poison page, memory_failure() 
also finds and notifies those processes that are accessing the poison 
page, and tries to recover the page.  And there seems to be no lock to 
prevent the 2nd memory_failure() and the clear-poison operation from 
racing with each other, which could leave the HWPoison bit in an 
uncertain state.  I think that's the problem.

 > So, I think all CXL poison notification events should trigger an
 > action optional memory_failure(). I expect this needs to make sure
 > that duplicates re not a problem. I.e. in the case of CPU consumption
 > of CXL poison, that causes a synchronous MF_ACTION_REQUIRED event via
 > the MCE path *and* it may trigger the device to send an error record
 > for the same page. As far as I can see, duplicate reports (MCE + CXL
 > device) are unavoidable.

As you mentioned in my other patch, this problem should be solved 
first.  My patch adds the poison record (tracking, which should not 
grow very large) to prevent the duplicate report.

> 
>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>> to check whether the current poison page has been reported (if yes,
>> stop the notifier chain, won't call the following memory_failure()
>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>> page already handled(recorded and reported) in (1) or (2), the other one
>> won't duplicate the report.  The record could be clear when
>> cxl_clear_poison() is called.
>>
>> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>> ---
>>   arch/x86/include/asm/mce.h |   1 +
>>   drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>>   drivers/cxl/core/memdev.c  |   6 +-
>>   drivers/cxl/cxlmem.h       |   3 +
>>   4 files changed, 139 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
>> index dfd2e9699bd7..d8109c48e7d9 100644
>> --- a/arch/x86/include/asm/mce.h
>> +++ b/arch/x86/include/asm/mce.h
>> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>>   	MCE_PRIO_NFIT,
>>   	MCE_PRIO_EXTLOG,
>>   	MCE_PRIO_UC,
>> +	MCE_PRIO_CXL,
>>   	MCE_PRIO_EARLY,
>>   	MCE_PRIO_CEC,
>>   	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
>> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
>> index 2626f3fff201..0eb3c5401e81 100644
>> --- a/drivers/cxl/core/mbox.c
>> +++ b/drivers/cxl/core/mbox.c
>> @@ -4,6 +4,8 @@
>>   #include <linux/debugfs.h>
>>   #include <linux/ktime.h>
>>   #include <linux/mutex.h>
>> +#include <linux/notifier.h>
>> +#include <asm/mce.h>
>>   #include <asm/unaligned.h>
>>   #include <cxlpci.h>
>>   #include <cxlmem.h>
>> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>>   		if (cxlr)
>>   			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>>   
>> +		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
>> +			return;
>> +
>>   		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>>   			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>>   						&evt->gen_media);
>> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
>>   }
>>   EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>>   
>> +struct cxl_mce_record {
>> +	struct list_head node;
>> +	u64 hpa;
>> +};
>> +LIST_HEAD(cxl_mce_records);
>> +DEFINE_MUTEX(cxl_mce_mutex);
> 
> I would recommend an xarray for this use case as that already has its
> own internal locking and efficient memory allocation for new nodes.
> 
> However, the "why" question needs to be answered first.

xarray does look better.  I didn't think of it before.  Thanks for the 
suggestion.

--
Thanks
Ruan.
Shiyang Ruan June 26, 2024, 6:03 a.m. UTC | #13
在 2024/6/22 4:44, Luck, Tony 写道:
>> So who actually cares about recovering poisoned volatile memory?
>> I'd like to understand more on how significant a use case this is.
>> Whilst I can conjecture that its an extreme case of wanting to avoid
>> loosing the ability to create 1GiB or larger pages due to poison
>> is that a real problem for anyone today?  Note this is just the case
>> where you've reached an actual uncorrectable error and probably
>> / possibly killed something, not the more common soft offlining
>> of memory due to correctable errors being detected.
> 
> I guess you really need a reply from someone with a data center
> with thousands of machines, since that's where this question
> may be important.
> 
> My humble opinion is that, outside of the huge page issue, nobody
> should try to recover a poisoned page. Systems that can report
> and recover from poison have tens, hundreds, or more GBytes
> of memory. Dropping 4K pages will not have any measurable
> impact on a system (even if there are hundreds of pages dropped).
> 
> There's no reliable way to determine whether the poisoned page
> was due to some transient issue, or a permanent defect. Recovering
> a poisoned page runs the risk that the poison will re-occur. Perhaps
> next use of the page will be in some unrecoverable (kernel) context.
> 
> So recovery has some risk, but very little upside benefit.

Since the hardware provides the instruction (CPU) / command (CXL) to clear 
the poison, we could make the function work, at least as an optional 
feature.  Then users could decide whether to use it after evaluating the 
risk and benefit.

I think doing recovery is an improvement step, and it may need a lot of 
discussion.  I'm not sure we can reach a conclusion in this thread. 
I just hope for more comments on the original problem (the duplicate 
report) that this patch tries to solve.


--
Thanks,
Ruan.

> 
> -Tony
Luck, Tony June 26, 2024, 3:56 p.m. UTC | #14
>> So recovery has some risk, but very little upside benefit.
>
> Since the hardware provides the instruction(CPU)/command(CXL) to clear 
> the poison, we could make the function work, at least as an optional 
> feature.  Then users could decide to use it or not after evaluating the 
> risk and benefit.
>
> I think doing recovery is an improvement step, and may need a lot of 
> discussion.  I'm not sure if we could reach a conclusion in this thread. 
>  Just hope more comments on the original problem (duplicate report) to 
> solve in this patch.

Post a patch to implement poison clearing/recovery with some way to opt-in
to get the discussion started. If it doesn't add too much complexity perhaps
it will be accepted.

-Tony
Shiyang Ruan July 2, 2024, 2:12 a.m. UTC | #15
在 2024/6/25 21:56, Shiyang Ruan 写道:
> 
> 
> 在 2024/6/22 1:51, Dan Williams 写道:
>> Shiyang Ruan wrote:
>>> Background:
>>> Since CXL device is a memory device, while CPU consumes a poison page of
>>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>>> which-First path is configured.  This is the first report.  Then
>>> currently, in FW-First path, the poison event is transferred according
>>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>>   -> CPER -> trace report.  This is the second one.  These two reports
>>> are indicating the same poisoning page, which is the so-called 
>>> "duplicate
>>> report"[1].  And the memory_failure() handling I'm trying to add in
>>> OS-First path could also be another duplicate report.
>>>
>>> Hope the flow below could make it easier to understand:
>>> CPU accesses bad memory on CXL device, then
>>>   -> MCE (INT18), *always* report (1)
>>>   -> * FW-First (implemented now)
>>>        -> CXL device -> FW
>>>           -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>>      * OS-First (not implemented yet, I'm working on it)
>>>        -> CXL device -> MSI
>>>           -> OS:CXL driver -> memory_failure() (2.b)
>>> so, the (1) and (2.a/b) are duplicated.
>>>
>>> (I didn't get response in my reply for [1] while I have to make patch to
>>> solve this problem, so please correct me if my understanding is wrong.)
>>
>> The CPU MCE may not be in the loop. Consider the case of patrol scrub,
>> or device-DMA accessing poison. In that case the device will signal a
>> component event and the CPU may never issue the MCE.
> 
> My other patch: "cxl/core: add poison creation event handler", adds 
> calling memory_failure() when an event is received form device, which 
> checks the poison record (insert if not exists) to make sure poison 
> would be reported/handled, but not twice, no matter CPU issues the MCE 
> later or earlier.  And the lock of poison record makes sure that it's 
> fine even in race condition.
> 
>>
>> What is missing for me from this description is *why* does the duplicate
>> report matter in practice? If all that happens is that the kernel
>> repeats the lookup to offline the page and set the HWPoison bit, is that
>> duplicated work worth adding more tracking?
> 
> Besides setting the HWPoison bit for the poison page, memory_failure() 
> also finds and notifies those processes who are accessing the poison 
> page, and tries to recovery the page.  And there seems no lock to 
> prevent the 2nd memory_failure() and clearing poison operation from 
> being called in race, then the HWPoison bit could be unsure.  I think 
> that's the problem.
> 
>  > So, I think all CXL poison notification events should trigger an
>  > action optional memory_failure(). I expect this needs to make sure
>  > that duplicates re not a problem. I.e. in the case of CPU consumption
>  > of CXL poison, that causes a synchronous MF_ACTION_REQUIRED event via
>  > the MCE path *and* it may trigger the device to send an error record
>  > for the same page. As far as I can see, duplicate reports (MCE + CXL
>  > device) are unavoidable.
> 
> As you mentioned in my other patch, this problem should be solved at 
> first.  My patch adds the poison record (tracking, which might not 
> become very large) to prevent duplicating report.

Ping~

And I ran into some problems when using xarray; please see below.

> 
>>
>>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>>> to check whether the current poison page has been reported (if yes,
>>> stop the notifier chain, won't call the following memory_failure()
>>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>>> page already handled(recorded and reported) in (1) or (2), the other one
>>> won't duplicate the report.  The record could be clear when
>>> cxl_clear_poison() is called.
>>>
>>> [1] 
>>> https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>>
>>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>>> ---
>>>   arch/x86/include/asm/mce.h |   1 +
>>>   drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>>>   drivers/cxl/core/memdev.c  |   6 +-
>>>   drivers/cxl/cxlmem.h       |   3 +
>>>   4 files changed, 139 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
>>> index dfd2e9699bd7..d8109c48e7d9 100644
>>> --- a/arch/x86/include/asm/mce.h
>>> +++ b/arch/x86/include/asm/mce.h
>>> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>>>       MCE_PRIO_NFIT,
>>>       MCE_PRIO_EXTLOG,
>>>       MCE_PRIO_UC,
>>> +    MCE_PRIO_CXL,
>>>       MCE_PRIO_EARLY,
>>>       MCE_PRIO_CEC,
>>>       MCE_PRIO_HIGHEST = MCE_PRIO_CEC
>>> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
>>> index 2626f3fff201..0eb3c5401e81 100644
>>> --- a/drivers/cxl/core/mbox.c
>>> +++ b/drivers/cxl/core/mbox.c
>>> @@ -4,6 +4,8 @@
>>>   #include <linux/debugfs.h>
>>>   #include <linux/ktime.h>
>>>   #include <linux/mutex.h>
>>> +#include <linux/notifier.h>
>>> +#include <asm/mce.h>
>>>   #include <asm/unaligned.h>
>>>   #include <cxlpci.h>
>>>   #include <cxlmem.h>
>>> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct 
>>> cxl_memdev *cxlmd,
>>>           if (cxlr)
>>>               hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>>> +        if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
>>> +            return;
>>> +
>>>           if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>>>               trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>>>                           &evt->gen_media);
>>> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct 
>>> cxl_memdev_state *mds)
>>>   }
>>>   EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>>> +struct cxl_mce_record {
>>> +    struct list_head node;
>>> +    u64 hpa;
>>> +};
>>> +LIST_HEAD(cxl_mce_records);
>>> +DEFINE_MUTEX(cxl_mce_mutex);
>>
>> I would recommend an xarray for this use case as that already has its
>> own internal locking and efficient memory allocation for new nodes.
>>
>> However, the "why" question needs to be answered first.
> 
> xarray does look better.  I didn't think of it befre.  Thanks for 
> suggestion.

I'm trying to use xarray, but I'm running into two problems.

The first one: xarray stores an entry at a specified index, but here we 
only need to store the PFN.  I'm not sure how to pick an index for a 
PFN, so I think xarray might not be suitable for my case.
The second one: because we only need to store/search the PFN, a list node 
only contains a list_head and a PFN, but xarray seems to require more 
memory for each node, which causes more overhead.

Maybe I'm overthinking this?


--
Thanks
Ruan.

> 
> -- 
> Thanks
> Ruan.
Shiyang Ruan July 19, 2024, 6:24 a.m. UTC | #16
在 2024/6/19 0:53, Shiyang Ruan 写道:
> 
> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> to check whether the current poison page has been reported (if yes,
> stop the notifier chain, won't call the following memory_failure()
> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> page already handled(recorded and reported) in (1) or (2), the other one
> won't duplicate the report.  The record could be clear when
> cxl_clear_poison() is called.

Hi guys,

I'd like to sort out the work I am currently carrying forward, to make 
sure I'm not going in the wrong direction. Please correct me if anything 
is wrong.

As is known, the CXL spec defines the POISON feature so a CXL memory 
device can notify its status when it has a broken page.  Basically, there 
are two major paths for the notification.

1. CPU handling error
   When a process accesses this broken page, the CXL device returns data
   with POISON.  When the CPU consumes the POISON, it raises an error
   notification.
   To be precise, "how the CPU should behave when it consumes POISON" is
   architecture dependent.  In my understanding, x86-64 raises a Machine
   Check Exception (MCE) via interrupt #18 in this case.
2. CXL device reporting error
   The CXL device detects the broken page by itself and sends a memory
   error signal to the kernel via one of two optional paths.
   2.a. FW-First
     The CXL device sends the error via VDM to the CXL host; the CXL host
     then sends it to system firmware via an interrupt, and finally the
     kernel handles the error.
   2.b. OS-First
     The CXL device sends the error directly to the kernel via MSI/MSI-X.

Note: since I'm currently focusing on x86-64, I'll basically describe 
x86-64 only.

The following diagram should describe the 2 major paths and 2 optional 
sub-paths above.
```
1.  MCE (interrupt #18, while CPU consuming POISON)
     -> do_machine_check()
       -> mce_log()
         -> notify chain (x86_mce_decoder_chain)
           -> memory_failure()
2.a FW-First (optional, CXL device proactively find&report)
     -> CXL device -> Firmware
       -> OS: ACPI->APEI->GHES->CPER -> CXL driver -> trace
2.b OS-First (optional, CXL device proactively find&report)
     -> CXL device -> MSI
       -> OS: CXL driver -> trace
```

For "1. CPU handling error" path, the current code seems to work fine. 
When I used error injection feature on QEMU emulation, the code path is 
executed certainly.  Then, if the CPU certainly raises a MCE when it 
consumes the POISON, this path has no problem.

So, I'm working on making for 2.a and 2.b path, which is CXL device 
reported POISON error could be handled by kernel.  This path has two 
advantages.

- Proactively find & report memory problems

   Even if a process has not read the data yet, the kernel/drivers can
   proactively prevent the process from using corrupted data.  AFAIK, the
   current kernel only traces POISON error events from the
   FW-First/OS-First paths, but it doesn't handle them, nor does it notify
   processes that are using the POISON page the way MCE handling does.
   User-space tools like rasdaemon read the trace and log it, but likewise
   don't handle the POISON page.  As a result, the user has to read the
   error log from rasdaemon, distinguish whether the POISON error is from
   CXL memory or DDR memory, and find out which applications are affected.
   That is not easy work and cannot be done in time.  Thus, I'd like to
   add a feature to make this work happen automatically and quickly: once
   the CXL device reports the POISON error (via FW-First/OS-First), the
   kernel handles it immediately, similar to the flow when an MCE is
   triggered.  This is my first motivation.

- Architecture independent

   As mentioned above, the "1. CPU handling error" path is architecture
   dependent.  On the other hand, this route can be architecture-
   independent code.  If there is a CPU that does not have a feature
   similar to the MCE of x86-64, my work will be essential.  (To be
   honest, I did not notice this advantage at first, as mentioned later,
   but I think this is also important.)


Here is the timeline of my development of it.

Two series of patches have been sent so far:
- PATCH: cxl/core: add poison creation event handler [1]
- PATCH: cxl: avoid duplicating report from MCE & device [2]
[1] 
https://lore.kernel.org/linux-cxl/20240417075053.3273543-1-ruansy.fnst@fujitsu.com/
[2] 
https://lore.kernel.org/linux-cxl/20240618165310.877974-1-ruansy.fnst@fujitsu.com/


The 1st patch[1] added a POISON error handler to the "2. CXL device 
reporting error" path.

My first version constructed MCE data from the POISON address and called 
mce_log() to handle the POISON.  But I was told that constructing MCE 
data is architecture dependent while CXL is not.  So, in a later version, 
the CXL code just calls memory_failure_queue() to handle the POISON error, 
avoiding the arch-dependent problem.
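As a rough illustration (the helper name is made up, not from the patch), the
arch-independent call boils down to something like:

```
#include <linux/mm.h>
#include <linux/pfn.h>

/* Queue recovery work instead of synthesizing an arch-specific struct mce. */
static void cxl_queue_memory_failure(u64 hpa)
{
	/* flags == 0: action-optional handling, run later from a workqueue. */
	memory_failure_queue(PHYS_PFN(hpa), 0);
}
```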

After many discussions, a new problem was found: as Dan said[3], the added 
POISON handling will cause the "duplicate report" problem:
 > So, I think all CXL poison notification events should trigger an
 > action optional memory_failure(). I expect this needs to make sure
 > that duplicates re not a problem. I.e. in the case of CPU consumption
 > of CXL poison, that causes a synchronous MF_ACTION_REQUIRED event via
 > the MCE path *and* it may trigger the device to send an error record
 > for the same page. As far as I can see, duplicate reports (MCE + CXL
 > device) are unavoidable.

[3] 
https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/


To solve this problem, I made the 2nd patch[2].  Allow me to describe 
the background again:

Since a CXL device is a memory device, when the CPU consumes a poison 
page of the CXL device, it always triggers an MCE (via interrupt #18) and 
calls memory_failure() to handle the POISON page, no matter which-First 
path is configured.

My patch adds memory_failure() to the FW-First/OS-First paths: if the 
device finds and reports the POISON, the kernel not only traces it but 
also calls memory_failure() to handle it, marked as "ADD" in the figure 
below.
```
1.  MCE (interrupt #18, while CPU consuming POISON)
     -> do_machine_check()
       -> mce_log()
         -> notify chain (x86_mce_decoder_chain)
           -> memory_failure() <---------------------------- EXISTS
2.a FW-First (optional, CXL device proactively find&report)
     -> CXL device -> Firmware
       -> OS: ACPI->APEI->GHES->CPER -> CXL driver -> trace
                                                  \-> memory_failure()
                                                      ^----- ADD
2.b OS-First (optional, CXL device proactively find&report)
     -> CXL device -> MSI
       -> OS: CXL driver -> trace
                        \-> memory_failure()
                            ^------------------------------- ADD
```

But in this way, memory_failure() could be called twice, or even at the 
same time, as shown in the figure above: (1.) and (2.a or 2.b), before 
the POISON page is cleared.  memory_failure() has its own mutex lock, so 
it won't actually run concurrently, and the later call could be avoided 
because the HWPoison bit has already been set.  However, consider this 
scenario: "CXL device reports POISON error" triggers the 1st call, the 
user sees it in the log and wants to clear the poison by executing the 
`cxl clear-poison` command, and at the same time a process tries to 
access this POISON page, which triggers an MCE (the 2nd call).  Since 
there is no lock between the 2nd call and the clear-poison operation, a 
race condition may happen, which may leave the page's HWPoison bit in an 
unknown state.

Thus, we have to avoid the 2nd call. This patch[2] introduces a new 
notifier_block into `x86_mce_decoder_chain` and a POISON cache list to 
stop the 2nd call of memory_failure(). It checks whether the current 
poison page has already been reported (if yes, it stops the notifier 
chain so the subsequent memory_failure() is not called to report again).
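For reference, a sketch of how such a notifier could be registered (the
memdev.c hunk is not quoted above, so the registration helper below is an
assumption, not the patch itself; cxl_handle_mce() and the mce_notifier field
are from the patch):

```
#include <linux/notifier.h>
#include <asm/mce.h>

static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
			  void *data);

static void cxl_register_mce_notifier(struct cxl_memdev_state *mds)
{
	mds->mce_notifier.notifier_call = cxl_handle_mce;
	mds->mce_notifier.priority = MCE_PRIO_CXL;
	/*
	 * Higher-priority notifiers run first, so a NOTIFY_STOP from
	 * cxl_handle_mce() keeps the later notifier from queueing
	 * memory_failure() for the same page again.
	 */
	mce_register_decode_chain(&mds->mce_notifier);
}
```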

Looking forward to your comments!


--
Thanks,
Ruan.
Dave Jiang July 19, 2024, 4:04 p.m. UTC | #17
On 7/1/24 7:12 PM, Shiyang Ruan wrote:
> 
> 
> 在 2024/6/25 21:56, Shiyang Ruan 写道:
>>
>>
>> 在 2024/6/22 1:51, Dan Williams 写道:
>>> Shiyang Ruan wrote:
>>>> Background:
>>>> Since CXL device is a memory device, while CPU consumes a poison page of
>>>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>>>> which-First path is configured.  This is the first report.  Then
>>>> currently, in FW-First path, the poison event is transferred according
>>>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>>>   -> CPER -> trace report.  This is the second one.  These two reports
>>>> are indicating the same poisoning page, which is the so-called "duplicate
>>>> report"[1].  And the memory_failure() handling I'm trying to add in
>>>> OS-First path could also be another duplicate report.
>>>>
>>>> Hope the flow below could make it easier to understand:
>>>> CPU accesses bad memory on CXL device, then
>>>>   -> MCE (INT18), *always* report (1)
>>>>   -> * FW-First (implemented now)
>>>>        -> CXL device -> FW
>>>>           -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>>>      * OS-First (not implemented yet, I'm working on it)
>>>>        -> CXL device -> MSI
>>>>           -> OS:CXL driver -> memory_failure() (2.b)
>>>> so, the (1) and (2.a/b) are duplicated.
>>>>
>>>> (I didn't get response in my reply for [1] while I have to make patch to
>>>> solve this problem, so please correct me if my understanding is wrong.)
>>>
>>> The CPU MCE may not be in the loop. Consider the case of patrol scrub,
>>> or device-DMA accessing poison. In that case the device will signal a
>>> component event and the CPU may never issue the MCE.
>>
>> My other patch: "cxl/core: add poison creation event handler", adds calling memory_failure() when an event is received form device, which checks the poison record (insert if not exists) to make sure poison would be reported/handled, but not twice, no matter CPU issues the MCE later or earlier.  And the lock of poison record makes sure that it's fine even in race condition.
>>
>>>
>>> What is missing for me from this description is *why* does the duplicate
>>> report matter in practice? If all that happens is that the kernel
>>> repeats the lookup to offline the page and set the HWPoison bit, is that
>>> duplicated work worth adding more tracking?
>>
>> Besides setting the HWPoison bit for the poison page, memory_failure() also finds and notifies those processes who are accessing the poison page, and tries to recovery the page.  And there seems no lock to prevent the 2nd memory_failure() and clearing poison operation from being called in race, then the HWPoison bit could be unsure.  I think that's the problem.
>>
>>  > So, I think all CXL poison notification events should trigger an
>>  > action optional memory_failure(). I expect this needs to make sure
>>  > that duplicates re not a problem. I.e. in the case of CPU consumption
>>  > of CXL poison, that causes a synchronous MF_ACTION_REQUIRED event via
>>  > the MCE path *and* it may trigger the device to send an error record
>>  > for the same page. As far as I can see, duplicate reports (MCE + CXL
>>  > device) are unavoidable.
>>
>> As you mentioned in my other patch, this problem should be solved at first.  My patch adds the poison record (tracking, which might not become very large) to prevent duplicating report.
> 
> Ping~
> 
> And I had some problems when using xarray, please see below.
> 
>>
>>>
>>>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>>>> to check whether the current poison page has been reported (if yes,
>>>> stop the notifier chain, won't call the following memory_failure()
>>>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>>>> page already handled(recorded and reported) in (1) or (2), the other one
>>>> won't duplicate the report.  The record could be clear when
>>>> cxl_clear_poison() is called.
>>>>
>>>> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>>>
>>>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>>>> ---
>>>>   arch/x86/include/asm/mce.h |   1 +
>>>>   drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>>>>   drivers/cxl/core/memdev.c  |   6 +-
>>>>   drivers/cxl/cxlmem.h       |   3 +
>>>>   4 files changed, 139 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
>>>> index dfd2e9699bd7..d8109c48e7d9 100644
>>>> --- a/arch/x86/include/asm/mce.h
>>>> +++ b/arch/x86/include/asm/mce.h
>>>> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>>>>       MCE_PRIO_NFIT,
>>>>       MCE_PRIO_EXTLOG,
>>>>       MCE_PRIO_UC,
>>>> +    MCE_PRIO_CXL,
>>>>       MCE_PRIO_EARLY,
>>>>       MCE_PRIO_CEC,
>>>>       MCE_PRIO_HIGHEST = MCE_PRIO_CEC
>>>> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
>>>> index 2626f3fff201..0eb3c5401e81 100644
>>>> --- a/drivers/cxl/core/mbox.c
>>>> +++ b/drivers/cxl/core/mbox.c
>>>> @@ -4,6 +4,8 @@
>>>>   #include <linux/debugfs.h>
>>>>   #include <linux/ktime.h>
>>>>   #include <linux/mutex.h>
>>>> +#include <linux/notifier.h>
>>>> +#include <asm/mce.h>
>>>>   #include <asm/unaligned.h>
>>>>   #include <cxlpci.h>
>>>>   #include <cxlmem.h>
>>>> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>>>>           if (cxlr)
>>>>               hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>>>> +        if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
>>>> +            return;
>>>> +
>>>>           if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>>>>               trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>>>>                           &evt->gen_media);
>>>> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
>>>>   }
>>>>   EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>>>> +struct cxl_mce_record {
>>>> +    struct list_head node;
>>>> +    u64 hpa;
>>>> +};
>>>> +LIST_HEAD(cxl_mce_records);
>>>> +DEFINE_MUTEX(cxl_mce_mutex);
>>>
>>> I would recommend an xarray for this use case as that already has its
>>> own internal locking and efficient memory allocation for new nodes.
>>>
>>> However, the "why" question needs to be answered first.
>>
>> xarray does look better.  I didn't think of it befre.  Thanks for suggestion.
> 
> I'm trying using xarray but but I'm running into two problems.
> 
> The first one is: xarray stores an entry with a specified index, but here we only need to store the PFN.  I'm not sure how to specific an index for a PFN.  So I think xarray might not suitable for my case.
> The second one: because we only need to store/search PFN, the list node only contains a list_head and a PFN. But xarray seems to require more memory for each node, which causes more overhead.

Use PFN as the index of the xarray since it's unique. And that makes finding the entry easy with xarray because you just pass in the PFN as the key. You can store an xarray entry with no data. At this point you are just detecting whether the entry has been stored (valid) or not. Hope that helps.  
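For example, a minimal sketch of that index-with-no-data pattern, with the PFN
as the key (names are just illustrative):

```
#include <linux/xarray.h>

static DEFINE_XARRAY(cxl_poison_pfns);

/* Inserting NULL stores a reserved "zero" entry: the index is present, no data. */
static bool cxl_poison_seen(unsigned long pfn)
{
	/* -EBUSY: already recorded; 0: recorded just now; -ENOMEM: allocation failed. */
	return xa_insert(&cxl_poison_pfns, pfn, NULL, GFP_KERNEL) == -EBUSY;
}

static void cxl_poison_forget(unsigned long pfn)
{
	xa_erase(&cxl_poison_pfns, pfn);
}
```

Unlike a separate lookup plus store, xa_insert() makes the detect-and-record
step atomic under the xarray's internal lock.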

> 
> Maybe I'm overthinking this?
> 
> 
> -- 
> Thanks
> Ruan.
> 
>>
>> -- 
>> Thanks
>> Ruan.
Shiyang Ruan July 22, 2024, 7:01 a.m. UTC | #18
在 2024/7/20 0:04, Dave Jiang 写道:
> 
> 
> On 7/1/24 7:12 PM, Shiyang Ruan wrote:
>>
>>
>> 在 2024/6/25 21:56, Shiyang Ruan 写道:
>>>
>>>
>>> 在 2024/6/22 1:51, Dan Williams 写道:
>>>> Shiyang Ruan wrote:
>>>>> Background:
>>>>> Since CXL device is a memory device, while CPU consumes a poison page of
>>>>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
>>>>> which-First path is configured.  This is the first report.  Then
>>>>> currently, in FW-First path, the poison event is transferred according
>>>>> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES
>>>>>    -> CPER -> trace report.  This is the second one.  These two reports
>>>>> are indicating the same poisoning page, which is the so-called "duplicate
>>>>> report"[1].  And the memory_failure() handling I'm trying to add in
>>>>> OS-First path could also be another duplicate report.
>>>>>
>>>>> Hope the flow below could make it easier to understand:
>>>>> CPU accesses bad memory on CXL device, then
>>>>>    -> MCE (INT18), *always* report (1)
>>>>>    -> * FW-First (implemented now)
>>>>>         -> CXL device -> FW
>>>>>            -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
>>>>>       * OS-First (not implemented yet, I'm working on it)
>>>>>         -> CXL device -> MSI
>>>>>            -> OS:CXL driver -> memory_failure() (2.b)
>>>>> so, the (1) and (2.a/b) are duplicated.
>>>>>
>>>>> (I didn't get response in my reply for [1] while I have to make patch to
>>>>> solve this problem, so please correct me if my understanding is wrong.)
>>>>
>>>> The CPU MCE may not be in the loop. Consider the case of patrol scrub,
>>>> or device-DMA accessing poison. In that case the device will signal a
>>>> component event and the CPU may never issue the MCE.
>>>
>>> My other patch: "cxl/core: add poison creation event handler", adds calling memory_failure() when an event is received form device, which checks the poison record (insert if not exists) to make sure poison would be reported/handled, but not twice, no matter CPU issues the MCE later or earlier.  And the lock of poison record makes sure that it's fine even in race condition.
>>>
>>>>
>>>> What is missing for me from this description is *why* does the duplicate
>>>> report matter in practice? If all that happens is that the kernel
>>>> repeats the lookup to offline the page and set the HWPoison bit, is that
>>>> duplicated work worth adding more tracking?
>>>
>>> Besides setting the HWPoison bit for the poison page, memory_failure() also finds and notifies those processes who are accessing the poison page, and tries to recovery the page.  And there seems no lock to prevent the 2nd memory_failure() and clearing poison operation from being called in race, then the HWPoison bit could be unsure.  I think that's the problem.
>>>
>>>   > So, I think all CXL poison notification events should trigger an
>>>   > action optional memory_failure(). I expect this needs to make sure
>>>   > that duplicates re not a problem. I.e. in the case of CPU consumption
>>>   > of CXL poison, that causes a synchronous MF_ACTION_REQUIRED event via
>>>   > the MCE path *and* it may trigger the device to send an error record
>>>   > for the same page. As far as I can see, duplicate reports (MCE + CXL
>>>   > device) are unavoidable.
>>>
>>> As you mentioned in my other patch, this problem should be solved at first.  My patch adds the poison record (tracking, which might not become very large) to prevent duplicating report.
>>
>> Ping~
>>
>> And I had some problems when using xarray, please see below.
>>
>>>
>>>>
>>>>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
>>>>> to check whether the current poison page has been reported (if yes,
>>>>> stop the notifier chain, won't call the following memory_failure()
>>>>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
>>>>> page already handled(recorded and reported) in (1) or (2), the other one
>>>>> won't duplicate the report.  The record could be clear when
>>>>> cxl_clear_poison() is called.
>>>>>
>>>>> [1] https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.amr.corp.intel.com.notmuch/
>>>>>
>>>>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>>>>> ---
>>>>>    arch/x86/include/asm/mce.h |   1 +
>>>>>    drivers/cxl/core/mbox.c    | 130 +++++++++++++++++++++++++++++++++++++
>>>>>    drivers/cxl/core/memdev.c  |   6 +-
>>>>>    drivers/cxl/cxlmem.h       |   3 +
>>>>>    4 files changed, 139 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
>>>>> index dfd2e9699bd7..d8109c48e7d9 100644
>>>>> --- a/arch/x86/include/asm/mce.h
>>>>> +++ b/arch/x86/include/asm/mce.h
>>>>> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>>>>>        MCE_PRIO_NFIT,
>>>>>        MCE_PRIO_EXTLOG,
>>>>>        MCE_PRIO_UC,
>>>>> +    MCE_PRIO_CXL,
>>>>>        MCE_PRIO_EARLY,
>>>>>        MCE_PRIO_CEC,
>>>>>        MCE_PRIO_HIGHEST = MCE_PRIO_CEC
>>>>> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
>>>>> index 2626f3fff201..0eb3c5401e81 100644
>>>>> --- a/drivers/cxl/core/mbox.c
>>>>> +++ b/drivers/cxl/core/mbox.c
>>>>> @@ -4,6 +4,8 @@
>>>>>    #include <linux/debugfs.h>
>>>>>    #include <linux/ktime.h>
>>>>>    #include <linux/mutex.h>
>>>>> +#include <linux/notifier.h>
>>>>> +#include <asm/mce.h>
>>>>>    #include <asm/unaligned.h>
>>>>>    #include <cxlpci.h>
>>>>>    #include <cxlmem.h>
>>>>> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
>>>>>            if (cxlr)
>>>>>                hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>>>>> +        if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
>>>>> +            return;
>>>>> +
>>>>>            if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>>>>>                trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>>>>>                            &evt->gen_media);
>>>>> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state *mds)
>>>>>    }
>>>>>    EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>>>>> +struct cxl_mce_record {
>>>>> +    struct list_head node;
>>>>> +    u64 hpa;
>>>>> +};
>>>>> +LIST_HEAD(cxl_mce_records);
>>>>> +DEFINE_MUTEX(cxl_mce_mutex);
>>>>
>>>> I would recommend an xarray for this use case as that already has its
>>>> own internal locking and efficient memory allocation for new nodes.
>>>>
>>>> However, the "why" question needs to be answered first.
>>>
>>> xarray does look better.  I didn't think of it befre.  Thanks for suggestion.
>>
>> I'm trying using xarray but but I'm running into two problems.
>>
>> The first one is: xarray stores an entry with a specified index, but here we only need to store the PFN.  I'm not sure how to specific an index for a PFN.  So I think xarray might not suitable for my case.
>> The second one: because we only need to store/search PFN, the list node only contains a list_head and a PFN. But xarray seems to require more memory for each node, which causes more overhead.
> 
> Use PFN as the index of the xarray since it's unique. And that makes finding the entry easy with xarray because you just pass in the PFN as the key. You can store an xarray entry with no data.

I didn't know it can store an index with no data.  I'll try it this way.

> At this point you are just detecting whether the entry has been stored (valid) or not. Hope that helps.

Thank you very much!


--
Ruan.

> 
>>
>> Maybe I'm overthinking this?
>>
>>
>> -- 
>> Thanks
>> Ruan.
>>
>>>
>>> -- 
>>> Thanks
>>> Ruan.
shiju.jose--- via July 25, 2024, 2:51 a.m. UTC | #19
Hello, everyone!

> >>> 在 2024/6/22 1:51, Dan Williams 写道:
> >>>> Shiyang Ruan wrote:
> >>>>> Background:
> >>>>> Since CXL device is a memory device, while CPU consumes a poison
> page of
> >>>>> CXL device, it always triggers a MCE by interrupt (INT18), no matter
> >>>>> which-First path is configured.  This is the first report.  Then
> >>>>> currently, in FW-First path, the poison event is transferred according
> >>>>> to the following process: CXL device -> firmware ->
> OS:ACPI->APEI->GHES
> >>>>>    -> CPER -> trace report.  This is the second one.  These two
> reports
> >>>>> are indicating the same poisoning page, which is the so-called
> "duplicate
> >>>>> report"[1].  And the memory_failure() handling I'm trying to add in
> >>>>> OS-First path could also be another duplicate report.
> >>>>>
> >>>>> Hope the flow below could make it easier to understand:
> >>>>> CPU accesses bad memory on CXL device, then
> >>>>>    -> MCE (INT18), *always* report (1)
> >>>>>    -> * FW-First (implemented now)
> >>>>>         -> CXL device -> FW
> >>>>>            -> OS:ACPI->APEI->GHES->CPER -> trace report
> (2.a)
> >>>>>       * OS-First (not implemented yet, I'm working on it)
> >>>>>         -> CXL device -> MSI
> >>>>>            -> OS:CXL driver -> memory_failure() (2.b)
> >>>>> so, the (1) and (2.a/b) are duplicated.
> >>>>>
> >>>>> (I didn't get response in my reply for [1] while I have to make patch to
> >>>>> solve this problem, so please correct me if my understanding is
> wrong.)
> >>>>
> >>>> The CPU MCE may not be in the loop. Consider the case of patrol scrub,
> >>>> or device-DMA accessing poison. In that case the device will signal a
> >>>> component event and the CPU may never issue the MCE.
> >>>
> >>> My other patch: "cxl/core: add poison creation event handler", adds
> calling memory_failure() when an event is received form device, which checks
> the poison record (insert if not exists) to make sure poison would be
> reported/handled, but not twice, no matter CPU issues the MCE later or
> earlier.  And the lock of poison record makes sure that it's fine even in race
> condition.
> >>>
> >>>>
> >>>> What is missing for me from this description is *why* does the duplicate
> >>>> report matter in practice? If all that happens is that the kernel
> >>>> repeats the lookup to offline the page and set the HWPoison bit, is that
> >>>> duplicated work worth adding more tracking?
> >>>
> >>> Besides setting the HWPoison bit for the poison page, memory_failure()
> also finds and notifies those processes who are accessing the poison page,
> and tries to recovery the page.  And there seems no lock to prevent the 2nd
> memory_failure() and clearing poison operation from being called in race, then
> the HWPoison bit could be unsure.  I think that's the problem.
> >>>
> >>>   > So, I think all CXL poison notification events should trigger an
> >>>   > action optional memory_failure(). I expect this needs to make sure
> >>>   > that duplicates re not a problem. I.e. in the case of CPU consumption
> >>>   > of CXL poison, that causes a synchronous MF_ACTION_REQUIRED
> event via
> >>>   > the MCE path *and* it may trigger the device to send an error record
> >>>   > for the same page. As far as I can see, duplicate reports (MCE + CXL
> >>>   > device) are unavoidable.
> >>>
> >>> As you mentioned in my other patch, this problem should be solved at
> first.  My patch adds the poison record (tracking, which might not become
> very large) to prevent duplicating report.
> >>
> >> Ping~
> >>
> >> And I had some problems when using xarray, please see below.
> >>
> >>>
> >>>>
> >>>>> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL
> memdev
> >>>>> to check whether the current poison page has been reported (if yes,
> >>>>> stop the notifier chain, won't call the following memory_failure()
> >>>>> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> >>>>> page already handled(recorded and reported) in (1) or (2), the other
> one
> >>>>> won't duplicate the report.  The record could be clear when
> >>>>> cxl_clear_poison() is called.
> >>>>>
> >>>>> [1]
> https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be294f8@dwillia2-mobl3.a
> mr.corp.intel.com.notmuch/
> >>>>>
> >>>>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> >>>>> ---
> >>>>>    arch/x86/include/asm/mce.h |   1 +
> >>>>>    drivers/cxl/core/mbox.c    | 130
> +++++++++++++++++++++++++++++++++++++
> >>>>>    drivers/cxl/core/memdev.c  |   6 +-
> >>>>>    drivers/cxl/cxlmem.h       |   3 +
> >>>>>    4 files changed, 139 insertions(+), 1 deletion(-)
> >>>>>
> >>>>> diff --git a/arch/x86/include/asm/mce.h
> b/arch/x86/include/asm/mce.h
> >>>>> index dfd2e9699bd7..d8109c48e7d9 100644
> >>>>> --- a/arch/x86/include/asm/mce.h
> >>>>> +++ b/arch/x86/include/asm/mce.h
> >>>>> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
> >>>>>        MCE_PRIO_NFIT,
> >>>>>        MCE_PRIO_EXTLOG,
> >>>>>        MCE_PRIO_UC,
> >>>>> +    MCE_PRIO_CXL,
> >>>>>        MCE_PRIO_EARLY,
> >>>>>        MCE_PRIO_CEC,
> >>>>>        MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> >>>>> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> >>>>> index 2626f3fff201..0eb3c5401e81 100644
> >>>>> --- a/drivers/cxl/core/mbox.c
> >>>>> +++ b/drivers/cxl/core/mbox.c
> >>>>> @@ -4,6 +4,8 @@
> >>>>>    #include <linux/debugfs.h>
> >>>>>    #include <linux/ktime.h>
> >>>>>    #include <linux/mutex.h>
> >>>>> +#include <linux/notifier.h>
> >>>>> +#include <asm/mce.h>
> >>>>>    #include <asm/unaligned.h>
> >>>>>    #include <cxlpci.h>
> >>>>>    #include <cxlmem.h>
> >>>>> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct
> cxl_memdev *cxlmd,
> >>>>>            if (cxlr)
> >>>>>                hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
> >>>>> +        if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> >>>>> +            return;
> >>>>> +
> >>>>>            if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
> >>>>>                trace_cxl_general_media(cxlmd, type, cxlr, hpa,
> >>>>>                            &evt->gen_media);
> >>>>> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct
> cxl_memdev_state *mds)
> >>>>>    }
> >>>>>    EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
> >>>>> +struct cxl_mce_record {
> >>>>> +    struct list_head node;
> >>>>> +    u64 hpa;
> >>>>> +};
> >>>>> +LIST_HEAD(cxl_mce_records);
> >>>>> +DEFINE_MUTEX(cxl_mce_mutex);
> >>>>
> >>>> I would recommend an xarray for this use case as that already has its
> >>>> own internal locking and efficient memory allocation for new nodes.
> >>>>
> >>>> However, the "why" question needs to be answered first.
> >>>
> >>> xarray does look better.  I didn't think of it befre.  Thanks for suggestion.
> >>
> >> I'm trying using xarray but but I'm running into two problems.
> >>
> >> The first one is: xarray stores an entry with a specified index, but here we
> only need to store the PFN.  I'm not sure how to specific an index for a
> PFN.  So I think xarray might not suitable for my case.
> >> The second one: because we only need to store/search PFN, the list node
> only contains a list_head and a PFN. But xarray seems to require more memory
> for each node, which causes more overhead.
> >
> > Use PFN as the index of the xarray since it's unique. And that makes finding
> the entry easy with xarray because you just pass in the PFN as the key. You can
> store an xarray entry with no data.
> 
> I didn't know it can store index with no data.  I'll try in this way.
> 
> > At this point you are just detecting whether the entry has been stored (valid)
> or not. Hope that helps.
> 
> Thank you very much!

Is anyone still unsure about the purpose and significance of this patch
as we move forward with the implementation?
We have tried to summarize the purpose and current status of this development in the following email:
https://lore.kernel.org/linux-mm/83c705ce-a013-4a88-adcd-18dbc16d88df@fujitsu.com/T/#mb2e8781ca6530351309e71acca99c661f9582492

I believe we can implement this using xarray, but I'd like to reach an agreement on the
objectives at this stage before submitting any new xarray-based patches. 
If doubts about the purpose arise after submitting the patch, it could be a waste of
time and effort.

Any thoughts? Otherwise, can we go ahead?

Thanks,
-----
Yasunori Goto




> 
> 
> --
> Ruan.
> 
> >
> >>
> >> Maybe I'm overthinking this?
> >>
> >>
> >> --
> >> Thanks
> >> Ruan.
> >>
> >>>
> >>> --
> >>> Thanks
> >>> Ruan.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index dfd2e9699bd7..d8109c48e7d9 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -182,6 +182,7 @@  enum mce_notifier_prios {
 	MCE_PRIO_NFIT,
 	MCE_PRIO_EXTLOG,
 	MCE_PRIO_UC,
+	MCE_PRIO_CXL,
 	MCE_PRIO_EARLY,
 	MCE_PRIO_CEC,
 	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 2626f3fff201..0eb3c5401e81 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -4,6 +4,8 @@ 
 #include <linux/debugfs.h>
 #include <linux/ktime.h>
 #include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <asm/mce.h>
 #include <asm/unaligned.h>
 #include <cxlpci.h>
 #include <cxlmem.h>
@@ -880,6 +882,9 @@  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 		if (cxlr)
 			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
 
+		if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
+			return;
+
 		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
 			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
 						&evt->gen_media);
@@ -1408,6 +1413,127 @@  int cxl_poison_state_init(struct cxl_memdev_state *mds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
 
+struct cxl_mce_record {
+	struct list_head node;
+	u64 hpa;
+};
+LIST_HEAD(cxl_mce_records);
+DEFINE_MUTEX(cxl_mce_mutex);
+
+bool cxl_mce_recorded(u64 hpa)
+{
+	struct cxl_mce_record *cur, *next, *rec;
+	int rc;
+
+	rc = mutex_lock_interruptible(&cxl_mce_mutex);
+	if (rc)
+		return false;
+
+	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
+		if (cur->hpa == hpa) {
+			mutex_unlock(&cxl_mce_mutex);
+			return true;
+		}
+	}
+
+	rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
+	rec->hpa = hpa;
+	list_add(&rec->node, &cxl_mce_records);
+
+	mutex_unlock(&cxl_mce_mutex);
+
+	return false;
+}
+
+void cxl_mce_clear(u64 hpa)
+{
+	struct cxl_mce_record *cur, *next;
+	int rc;
+
+	rc = mutex_lock_interruptible(&cxl_mce_mutex);
+	if (rc)
+		return;
+
+	list_for_each_entry_safe(cur, next, &cxl_mce_records, node) {
+		if (cur->hpa == hpa) {
+			list_del(&cur->node);
+			break;
+		}
+	}
+
+	mutex_unlock(&cxl_mce_mutex);
+}
+
+struct cxl_contains_hpa_context {
+	bool contains;
+	u64 hpa;
+};
+
+static int __cxl_contains_hpa(struct device *dev, void *arg)
+{
+	struct cxl_contains_hpa_context *ctx = arg;
+	struct cxl_endpoint_decoder *cxled;
+	struct range *range;
+	u64 hpa = ctx->hpa;
+
+	if (!is_endpoint_decoder(dev))
+		return 0;
+
+	cxled = to_cxl_endpoint_decoder(dev);
+	range = &cxled->cxld.hpa_range;
+
+	if (range->start <= hpa && hpa <= range->end) {
+		ctx->contains = true;
+		return 1;
+	}
+
+	return 0;
+}
+
+static bool cxl_contains_hpa(const struct cxl_memdev *cxlmd, u64 hpa)
+{
+	struct cxl_contains_hpa_context ctx = {
+		.contains = false,
+		.hpa = hpa,
+	};
+	struct cxl_port *port;
+
+	port = cxlmd->endpoint;
+	if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
+		device_for_each_child(&port->dev, &ctx, __cxl_contains_hpa);
+
+	return ctx.contains;
+}
+
+static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
+			  void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
+						    mce_notifier);
+	u64 hpa;
+
+	if (!mce || !mce_usable_address(mce))
+		return NOTIFY_DONE;
+
+	hpa = mce->addr & MCI_ADDR_PHYSADDR;
+
+	/* Check if the PFN is located on this CXL device */
+	if (!pfn_valid(hpa >> PAGE_SHIFT) &&
+	    !cxl_contains_hpa(mds->cxlds.cxlmd, hpa))
+		return NOTIFY_DONE;
+
+	/*
+	 * Search PFN in the cxl_mce_records, if already exists, don't continue
+	 * to do memory_failure() to avoid a poison address being reported
+	 * more than once.
+	 */
+	if (cxl_mce_recorded(hpa))
+		return NOTIFY_STOP;
+	else
+		return NOTIFY_OK;
+}
+
 struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
 {
 	struct cxl_memdev_state *mds;
@@ -1427,6 +1553,10 @@  struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
 	mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
 	mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
 
+	mds->mce_notifier.notifier_call = cxl_handle_mce;
+	mds->mce_notifier.priority = MCE_PRIO_CXL;
+	mce_register_decode_chain(&mds->mce_notifier);
+
 	return mds;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, CXL);
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 0277726afd04..aa3ac89d17be 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -376,10 +376,14 @@  int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
 		goto out;
 
 	cxlr = cxl_dpa_to_region(cxlmd, dpa);
-	if (cxlr)
+	if (cxlr) {
+		u64 hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
+
+		cxl_mce_clear(hpa);
 		dev_warn_once(mds->cxlds.dev,
 			      "poison clear dpa:%#llx region: %s\n", dpa,
 			      dev_name(&cxlr->dev));
+	}
 
 	record = (struct cxl_poison_record) {
 		.address = cpu_to_le64(dpa),
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 19aba81cdf13..fbf8d9f46984 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -501,6 +501,7 @@  struct cxl_memdev_state {
 	struct cxl_fw_state fw;
 
 	struct rcuwait mbox_wait;
+	struct notifier_block mce_notifier;
 	int (*mbox_send)(struct cxl_memdev_state *mds,
 			 struct cxl_mbox_cmd *cmd);
 };
@@ -836,6 +837,8 @@  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
 int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
 int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
 int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
+bool cxl_mce_recorded(u64 hpa);
+void cxl_mce_clear(u64 hpa);
 
 #ifdef CONFIG_CXL_SUSPEND
 void cxl_mem_active_inc(void);