diff mbox series

[v2,1/1] drivers/perf: Fix kernel panic due to the invalid mon_ctx pointer

Message ID 20231026233443.3493667-1-sdonthineni@nvidia.com (mailing list archive)
State New, archived
Headers show
Series [v2,1/1] drivers/perf: Fix kernel panic due to the invalid mon_ctx pointer | expand

Commit Message

Shanker Donthineni Oct. 26, 2023, 11:34 p.m. UTC
The pointer returned by resctrl_arch_mon_ctx_alloc_no_wait() is saved in
the 32-bit variable 'hwc->idx', which results in the loss of the upper
32 bits. This, in turn, triggers a kernel panic when the corrupted
pointer is later accessed.

Use 'event->pmu_private' instead of 'hwc->idx' to resolve the issue.

A second, related problem is a continuous stream of 'error irq' messages.
The resctrl_arch_mon_ctx_alloc_no_wait() function returns a valid pointer
even when it cannot allocate monitor resources, resulting in an incorrect
MON_SEL configuration. To resolve this issue, return an ERR_PTR when monitor
allocation fails.

dmesg:
  mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0
  mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0
  mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0
  ...
  mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0

Signed-off-by: Shanker Donthineni <sdonthineni@nvidia.com>
---
Changes in v2:
 -Use 'event->pmu_private' instead of 'hwc->idx' to keep monitor context
 -Return ERR_PTR if unable to allocate resource in resctrl_arch_mon_ctx_alloc_no_wait()

 drivers/perf/resctrl_pmu.c           | 12 ++++++------
 drivers/platform/mpam/mpam_resctrl.c | 23 ++++++++++++++++-------
 2 files changed, 22 insertions(+), 13 deletions(-)

Comments

Will Deacon Oct. 27, 2023, 10:27 a.m. UTC | #1
On Thu, Oct 26, 2023 at 06:34:43PM -0500, Shanker Donthineni wrote:
> The return pointer from the resctrl_arch_mon_ctx_alloc_no_wait() function
> is saved in a 32-bit variable 'hwc->idx' which results in the loss of
> the upper 32 bits. This, in turn, triggers a kernel panic when attempting
> to access a corrupted pointer.
> 
> Use 'event->pmu_private' instead of 'hwc->idx' to resolve the issue.
> 
> Another reason contributing to the problem due to continuous 'error irq'
> messages. The resctrl_arch_mon_ctx_alloc_no_wait() function returns a valid
> PTR when it cannot allocate monitor resources, resulting in an incorrect
> MON_SEL configuration. To resolve this issue, return an ERR_PTR when monitor
> allocation fails
> 
> dmesg:
>   mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0
>   mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0
>   mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0
>   ...
>   mpam: error irq from msc:0 'Monitor_Range', partid:228, pmg: 0, ris: 0
> 
> Signed-off-by: Shanker Donthineni <sdonthineni@nvidia.com>
> ---
> Changes in v2:
>  -Use 'event->pmu_private' instead of 'hwc->idx' to keep monitor context
>  -Return ERR_PTR if unable to allocate resource in resctrl_arch_mon_ctx_alloc_no_wait()
> 
>  drivers/perf/resctrl_pmu.c           | 12 ++++++------
>  drivers/platform/mpam/mpam_resctrl.c | 23 ++++++++++++++++-------

I can't see these files upstream or in linux-next, so please don't cc me on
changes to them until they've landed. My inbox is bad enough as it is!

Thanks,

Will
diff mbox series

Patch

diff --git a/drivers/perf/resctrl_pmu.c b/drivers/perf/resctrl_pmu.c
index 99a2b90b5d83..1531c8c6fc31 100644
--- a/drivers/perf/resctrl_pmu.c
+++ b/drivers/perf/resctrl_pmu.c
@@ -66,7 +66,6 @@  static struct rdt_resource *resctrl_event_get_resource(u16 event_num)
 
 static void resctrl_pmu_event_destroy(struct perf_event *event)
 {
-	struct hw_perf_event *hwc = &event->hw;
 	u16 event_num = get_event(event);
 	struct rdt_resource *r;
 
@@ -74,7 +73,7 @@  static void resctrl_pmu_event_destroy(struct perf_event *event)
 	if (!r)
 		return;
 
-	resctrl_arch_mon_ctx_free(r, event_num, hwc->idx);
+	resctrl_arch_mon_ctx_free(r, event_num, event->pmu_private);
 }
 
 static int resctrl_pmu_event_init(struct perf_event *event)
@@ -144,9 +143,9 @@  static int resctrl_pmu_event_init(struct perf_event *event)
 			return -EINVAL;
 	}
 
-	hwc->idx = resctrl_arch_mon_ctx_alloc_no_wait(r, event_num);
-	if (hwc->idx == -ENOSPC)
-		return -ENOSPC;
+	event->pmu_private = resctrl_arch_mon_ctx_alloc_no_wait(r, event_num);
+	if (event->pmu_private && IS_ERR(event->pmu_private))
+		return PTR_ERR(event->pmu_private);
 	event->destroy = resctrl_pmu_event_destroy;
 	local64_set(&hwc->prev_count, 0);
 	local64_set(&event->count, 0);
@@ -183,7 +182,8 @@  static void resctrl_pmu_event_update(struct perf_event *event)
 		prev = local64_read(&hwc->prev_count);
 
 		err = resctrl_arch_rmid_read(r, d, closid, rmid,
-					     event_num, &now, hwc->idx);
+					     event_num, &now,
+					     event->pmu_private);
 		if (err)
 			return;
 	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
diff --git a/drivers/platform/mpam/mpam_resctrl.c b/drivers/platform/mpam/mpam_resctrl.c
index 1a691535d23d..8ec963aca9d4 100644
--- a/drivers/platform/mpam/mpam_resctrl.c
+++ b/drivers/platform/mpam/mpam_resctrl.c
@@ -318,28 +318,37 @@  struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
 void *resctrl_arch_mon_ctx_alloc_no_wait(struct rdt_resource *r, int evtid)
 {
 	struct mpam_resctrl_res *res;
-	u32 *ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+	u32 *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	int err;
 
-	if (!ret)
+	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
 	switch (evtid) {
 	case QOS_L3_OCCUP_EVENT_ID:
 		res = container_of(r, struct mpam_resctrl_res, resctrl_res);
 
-		*ret = mpam_alloc_csu_mon(res->class);
-		return ret;
+		err = mpam_alloc_csu_mon(res->class);
+		break;
 	case QOS_L3_MBM_LOCAL_EVENT_ID:
 	case QOS_L3_MBM_TOTAL_EVENT_ID:
 		if (mpam_monitors_free_runing)
 			return mon_is_rmid_idx;
 		res = container_of(r, struct mpam_resctrl_res, resctrl_res);
 
-		*ret = mpam_alloc_mbwu_mon(res->class);
-		return ret;
+		err = mpam_alloc_mbwu_mon(res->class);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+
+	if (err < 0) {
+		kfree(ctx);
+		return ERR_PTR(err);
 	}
 
-	return ERR_PTR(-EOPNOTSUPP);
+	*ctx = err;
+	return ctx;
 }
 
 void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,