diff mbox series

[RFC,4/5] cxl/core: add report option for cxl_mem_get_poison()

Message ID 20240209115417.724638-7-ruansy.fnst@fujitsu.com
State New, archived
Headers show
Series [RFC,1/5] cxl/core: correct length of DPA field masks | expand

Commit Message

Shiyang Ruan Feb. 9, 2024, 11:54 a.m. UTC
When a poison event is received, the driver uses the GET_POISON_LIST
command to get the poison list.  The driver already has
cxl_mem_get_poison(), so reuse it and add a 'bool report' parameter:
when it is set to true, each poison record is reported to MCE.

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
---
 drivers/cxl/core/mbox.c   | 7 +++++--
 drivers/cxl/core/memdev.c | 4 ++--
 drivers/cxl/core/region.c | 8 ++++----
 drivers/cxl/cxlmem.h      | 2 +-
 4 files changed, 12 insertions(+), 9 deletions(-)

Comments

Dan Williams Feb. 10, 2024, 6:49 a.m. UTC | #1
Shiyang Ruan wrote:
> When a poison event is received, driver uses GET_POISON_LIST command
> to get the poison list.  Now driver has cxl_mem_get_poison(), so
> reuse it and add a parameter 'bool report', report poison record to MCE
> if set true.

If the memory error record has the poison event, why does the poison
list need to be retrieved by the kernel? I would expect it is sufficient
to just report the single poison event and leave it to userspace to
react to that event and retrieve more data if it wants.
Shiyang Ruan March 14, 2024, 3:01 p.m. UTC | #2
在 2024/2/10 14:49, Dan Williams 写道:
> Shiyang Ruan wrote:
>> When a poison event is received, driver uses GET_POISON_LIST command
>> to get the poison list.  Now driver has cxl_mem_get_poison(), so
>> reuse it and add a parameter 'bool report', report poison record to MCE
>> if set true.
> 
> If the memory error record has the poison event, why does the poison
> list need to be retrieved by the kernel? I would expect it is sufficient
> to just report the single poison event and leave it to userspace to
> react to that event and retrieve more data if it wants.

The GMER (General Media Event Record) has only a physical address field
and no range/length for the poison, so we can't get the poison range
from the single event record.  Since the poison range is injected by one
command and one GMER is sent to the driver, we have to use the
GET_POISON_LIST command to get the length.


--
Thanks,
Ruan.
diff mbox series

Patch

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index f9b6f50fbe80..e1c67159acc4 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1324,7 +1324,7 @@  static void cxl_mem_report_poison(struct cxl_memdev *cxlmd,
 }
 
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
-		       struct cxl_region *cxlr)
+		       struct cxl_region *cxlr, bool report)
 {
 	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
 	struct cxl_mbox_poison_out *po;
@@ -1355,10 +1355,13 @@  int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
 		if (rc)
 			break;
 
-		for (int i = 0; i < le16_to_cpu(po->count); i++)
+		for (int i = 0; i < le16_to_cpu(po->count); i++) {
 			trace_cxl_poison(cxlmd, cxlr, &po->record[i],
 					 po->flags, po->overflow_ts,
 					 CXL_POISON_TRACE_LIST);
+			if (report)
+				cxl_mem_report_poison(cxlmd, &po->record[i]);
+		}
 
 		/* Protect against an uncleared _FLAG_MORE */
 		nr_records = nr_records + le16_to_cpu(po->count);
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index c304e709ef0e..320bcb8af5b0 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -200,14 +200,14 @@  static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd)
 	if (resource_size(&cxlds->pmem_res)) {
 		offset = cxlds->pmem_res.start;
 		length = resource_size(&cxlds->pmem_res);
-		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
 		if (rc)
 			return rc;
 	}
 	if (resource_size(&cxlds->ram_res)) {
 		offset = cxlds->ram_res.start;
 		length = resource_size(&cxlds->ram_res);
-		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
 		/*
 		 * Invalid Physical Address is not an error for
 		 * volatile addresses. Device support is optional.
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index ce0e2d82bb2b..b008c7e13560 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2386,7 +2386,7 @@  static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
 	if (ctx->mode == CXL_DECODER_RAM) {
 		offset = ctx->offset;
 		length = resource_size(&cxlds->ram_res) - offset;
-		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
 		if (rc == -EFAULT)
 			rc = 0;
 		if (rc)
@@ -2404,7 +2404,7 @@  static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
 		return 0;
 	}
 
-	return cxl_mem_get_poison(cxlmd, offset, length, NULL);
+	return cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
 }
 
 static int poison_by_decoder(struct device *dev, void *arg)
@@ -2438,7 +2438,7 @@  static int poison_by_decoder(struct device *dev, void *arg)
 	if (cxled->skip) {
 		offset = cxled->dpa_res->start - cxled->skip;
 		length = cxled->skip;
-		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
 		if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
 			rc = 0;
 		if (rc)
@@ -2447,7 +2447,7 @@  static int poison_by_decoder(struct device *dev, void *arg)
 
 	offset = cxled->dpa_res->start;
 	length = cxled->dpa_res->end - offset + 1;
-	rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
+	rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region, false);
 	if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
 		rc = 0;
 	if (rc)
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 97ddab421e63..f0877f055f53 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -831,7 +831,7 @@  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
-		       struct cxl_region *cxlr);
+		       struct cxl_region *cxlr, bool report);
 int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
 phys_addr_t cxl_memdev_dpa_to_hpa(struct cxl_memdev *cxlmd, u64 dpa);
 int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);