diff mbox series

[v2] drm/panthor: Make sure we handle 'unknown group state' case properly

Message ID 20240502155248.1430582-1-boris.brezillon@collabora.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/panthor: Make sure we handle 'unknown group state' case properly | expand

Commit Message

Boris Brezillon May 2, 2024, 3:52 p.m. UTC
When we check for state values returned by the FW, we only cover part of
the 0:7 range. Make sure we catch FW inconsistencies by adding a default
to the switch statement, and flagging the group state as unknown in that
case.

When an unknown state is detected, we trigger a reset, and consider the
group as unusable after that point, to prevent the potential corruption
from creeping in other places if we continue executing stuff on this
context.

v2:
- Add Steve's R-b
- Fix commit message

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Suggested-by: Steven Price <steven.price@arm.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
---
 drivers/gpu/drm/panthor/panthor_sched.c | 37 +++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

Comments

Boris Brezillon May 2, 2024, 4:44 p.m. UTC | #1
On Thu,  2 May 2024 17:52:48 +0200
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> When we check for state values returned by the FW, we only cover part of
> the 0:7 range. Make sure we catch FW inconsistencies by adding a default
> to the switch statement, and flagging the group state as unknown in that
> case.
> 
> When an unknown state is detected, we trigger a reset, and consider the
> group as unusable after that point, to prevent the potential corruption
> from creeping in other places if we continue executing stuff on this
> context.
> 
> v2:
> - Add Steve's R-b
> - Fix commit message
> 
> Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> Suggested-by: Steven Price <steven.price@arm.com>
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> Reviewed-by: Steven Price <steven.price@arm.com>
> Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>

Queued to drm-misc-next-fixes.

> ---
>  drivers/gpu/drm/panthor/panthor_sched.c | 37 +++++++++++++++++++++++--
>  1 file changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index b3a51a6de523..1a14ee30f774 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -490,6 +490,18 @@ enum panthor_group_state {
>  	 * Can no longer be scheduled. The only allowed action is a destruction.
>  	 */
>  	PANTHOR_CS_GROUP_TERMINATED,
> +
> +	/**
> +	 * @PANTHOR_CS_GROUP_UNKNOWN_STATE: Group is an unknown state.
> +	 *
> +	 * The FW returned an inconsistent state. The group is flagged unusable
> +	 * and can no longer be scheduled. The only allowed action is a
> +	 * destruction.
> +	 *
> +	 * When that happens, we also schedule a FW reset, to start from a fresh
> +	 * state.
> +	 */
> +	PANTHOR_CS_GROUP_UNKNOWN_STATE,
>  };
>  
>  /**
> @@ -1127,6 +1139,7 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
>  	struct panthor_fw_csg_iface *csg_iface;
>  	struct panthor_group *group;
>  	enum panthor_group_state new_state, old_state;
> +	u32 csg_state;
>  
>  	lockdep_assert_held(&ptdev->scheduler->lock);
>  
> @@ -1137,7 +1150,8 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
>  		return;
>  
>  	old_state = group->state;
> -	switch (csg_iface->output->ack & CSG_STATE_MASK) {
> +	csg_state = csg_iface->output->ack & CSG_STATE_MASK;
> +	switch (csg_state) {
>  	case CSG_STATE_START:
>  	case CSG_STATE_RESUME:
>  		new_state = PANTHOR_CS_GROUP_ACTIVE;
> @@ -1148,11 +1162,28 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
>  	case CSG_STATE_SUSPEND:
>  		new_state = PANTHOR_CS_GROUP_SUSPENDED;
>  		break;
> +	default:
> +		/* The unknown state might be caused by a FW state corruption,
> +		 * which means the group metadata can't be trusted anymore, and
> +		 * the SUSPEND operation might propagate the corruption to the
> +		 * suspend buffers. Flag the group state as unknown to make
> +		 * sure it's unusable after that point.
> +		 */
> +		drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
> +			csg_id, csg_state);
> +		new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
> +		break;
>  	}
>  
>  	if (old_state == new_state)
>  		return;
>  
> +	/* The unknown state might be caused by a FW issue, reset the FW to
> +	 * take a fresh start.
> +	 */
> +	if (new_state == PANTHOR_CS_GROUP_UNKNOWN_STATE)
> +		panthor_device_schedule_reset(ptdev);
> +
>  	if (new_state == PANTHOR_CS_GROUP_SUSPENDED)
>  		csg_slot_sync_queues_state_locked(ptdev, csg_id);
>  
> @@ -1783,6 +1814,7 @@ static bool
>  group_can_run(struct panthor_group *group)
>  {
>  	return group->state != PANTHOR_CS_GROUP_TERMINATED &&
> +	       group->state != PANTHOR_CS_GROUP_UNKNOWN_STATE &&
>  	       !group->destroyed && group->fatal_queues == 0 &&
>  	       !group->timedout;
>  }
> @@ -2557,7 +2589,8 @@ void panthor_sched_suspend(struct panthor_device *ptdev)
>  
>  		if (csg_slot->group) {
>  			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i,
> -						CSG_STATE_SUSPEND,
> +						group_can_run(csg_slot->group) ?
> +						CSG_STATE_SUSPEND : CSG_STATE_TERMINATE,
>  						CSG_STATE_MASK);
>  		}
>  	}
diff mbox series

Patch

diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index b3a51a6de523..1a14ee30f774 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -490,6 +490,18 @@  enum panthor_group_state {
 	 * Can no longer be scheduled. The only allowed action is a destruction.
 	 */
 	PANTHOR_CS_GROUP_TERMINATED,
+
+	/**
+	 * @PANTHOR_CS_GROUP_UNKNOWN_STATE: Group is an unknown state.
+	 *
+	 * The FW returned an inconsistent state. The group is flagged unusable
+	 * and can no longer be scheduled. The only allowed action is a
+	 * destruction.
+	 *
+	 * When that happens, we also schedule a FW reset, to start from a fresh
+	 * state.
+	 */
+	PANTHOR_CS_GROUP_UNKNOWN_STATE,
 };
 
 /**
@@ -1127,6 +1139,7 @@  csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
 	struct panthor_fw_csg_iface *csg_iface;
 	struct panthor_group *group;
 	enum panthor_group_state new_state, old_state;
+	u32 csg_state;
 
 	lockdep_assert_held(&ptdev->scheduler->lock);
 
@@ -1137,7 +1150,8 @@  csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
 		return;
 
 	old_state = group->state;
-	switch (csg_iface->output->ack & CSG_STATE_MASK) {
+	csg_state = csg_iface->output->ack & CSG_STATE_MASK;
+	switch (csg_state) {
 	case CSG_STATE_START:
 	case CSG_STATE_RESUME:
 		new_state = PANTHOR_CS_GROUP_ACTIVE;
@@ -1148,11 +1162,28 @@  csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
 	case CSG_STATE_SUSPEND:
 		new_state = PANTHOR_CS_GROUP_SUSPENDED;
 		break;
+	default:
+		/* The unknown state might be caused by a FW state corruption,
+		 * which means the group metadata can't be trusted anymore, and
+		 * the SUSPEND operation might propagate the corruption to the
+		 * suspend buffers. Flag the group state as unknown to make
+		 * sure it's unusable after that point.
+		 */
+		drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
+			csg_id, csg_state);
+		new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
+		break;
 	}
 
 	if (old_state == new_state)
 		return;
 
+	/* The unknown state might be caused by a FW issue, reset the FW to
+	 * take a fresh start.
+	 */
+	if (new_state == PANTHOR_CS_GROUP_UNKNOWN_STATE)
+		panthor_device_schedule_reset(ptdev);
+
 	if (new_state == PANTHOR_CS_GROUP_SUSPENDED)
 		csg_slot_sync_queues_state_locked(ptdev, csg_id);
 
@@ -1783,6 +1814,7 @@  static bool
 group_can_run(struct panthor_group *group)
 {
 	return group->state != PANTHOR_CS_GROUP_TERMINATED &&
+	       group->state != PANTHOR_CS_GROUP_UNKNOWN_STATE &&
 	       !group->destroyed && group->fatal_queues == 0 &&
 	       !group->timedout;
 }
@@ -2557,7 +2589,8 @@  void panthor_sched_suspend(struct panthor_device *ptdev)
 
 		if (csg_slot->group) {
 			csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i,
-						CSG_STATE_SUSPEND,
+						group_can_run(csg_slot->group) ?
+						CSG_STATE_SUSPEND : CSG_STATE_TERMINATE,
 						CSG_STATE_MASK);
 		}
 	}