diff mbox series

[v3,3/3] drm/panthor: Report innocent group kill

Message ID 20241029152912.270346-4-boris.brezillon@collabora.com (mailing list archive)
State New
Headers show
Series drm/panthor: Fix group state reporting | expand

Commit Message

Boris Brezillon Oct. 29, 2024, 3:29 p.m. UTC
Groups can be killed during a reset even though they did nothing wrong.
That usually happens when the FW is put in a bad state by other groups,
resulting in group suspension failures when the reset happens.

If we end up in that situation, flag the group as innocent and report
its innocence through the new DRM_PANTHOR_GROUP_STATE_INNOCENT flag.

Bump the minor driver version to reflect the uAPI change.

Changes in v3:
- Actually report innocence to userspace

Changes in v2:
- New patch

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 drivers/gpu/drm/panthor/panthor_drv.c   |  2 +-
 drivers/gpu/drm/panthor/panthor_sched.c | 18 ++++++++++++++++++
 include/uapi/drm/panthor_drm.h          |  9 +++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

Comments

Boris Brezillon Oct. 29, 2024, 4:19 p.m. UTC | #1
On Tue, 29 Oct 2024 16:29:12 +0100
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> Groups can be killed during a reset even though they did nothing wrong.
> That usually happens when the FW is put in a bad state by other groups,
> resulting in group suspension failures when the reset happens.
> 
> If we end up in that situation, flag the group innocent and report
> innocence through a new DRM_PANTHOR_GROUP_STATE flag.
> 
> Bump the minor driver version to reflect the uAPI change.
> 
> Changes in v3:
> - Actually report innocence to userspace
> 
> Changes in v2:
> - New patch
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
>  drivers/gpu/drm/panthor/panthor_drv.c   |  2 +-
>  drivers/gpu/drm/panthor/panthor_sched.c | 18 ++++++++++++++++++
>  include/uapi/drm/panthor_drm.h          |  9 +++++++++
>  3 files changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> index ac7e53f6e3f0..f1dff7e0173d 100644
> --- a/drivers/gpu/drm/panthor/panthor_drv.c
> +++ b/drivers/gpu/drm/panthor/panthor_drv.c

Forgot to update the changelog with:

--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1493,6 +1493,7 @@ static void panthor_debugfs_init(struct drm_minor *minor)
  * - 1.1 - adds DEV_QUERY_TIMESTAMP_INFO query
  * - 1.2 - adds DEV_QUERY_GROUP_PRIORITIES_INFO query
  *       - adds PANTHOR_GROUP_PRIORITY_REALTIME priority
+ * - 1.3 - adds DRM_PANTHOR_GROUP_STATE_INNOCENT flag
  */

I'll send a v4 addressing that, but I'll probably queue the first two
patches to drm-misc-fixes first.

> @@ -1507,7 +1507,7 @@ static const struct drm_driver panthor_drm_driver = {
>  	.desc = "Panthor DRM driver",
>  	.date = "20230801",
>  	.major = 1,
> -	.minor = 2,
> +	.minor = 3,
>  
>  	.gem_create_object = panthor_gem_create_object,
>  	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> index ef4bec7ff9c7..97ed5fe5a191 100644
> --- a/drivers/gpu/drm/panthor/panthor_sched.c
> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> @@ -610,6 +610,16 @@ struct panthor_group {
>  	 */
>  	bool timedout;
>  
> +	/**
> +	 * @innocent: True when the group becomes unusable because the group suspension
> +	 * failed during a reset.
> +	 *
> +	 * Sometimes the FW was put in a bad state by other groups, causing the group
> +	 * suspension happening in the reset path to fail. In that case, we consider the
> +	 * group innocent.
> +	 */
> +	bool innocent;
> +
>  	/**
>  	 * @syncobjs: Pool of per-queue synchronization objects.
>  	 *
> @@ -2690,6 +2700,12 @@ void panthor_sched_suspend(struct panthor_device *ptdev)
>  			u32 csg_id = ffs(slot_mask) - 1;
>  			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
>  
> +			/* If the group was still usable before that point, we consider
> +			 * it innocent.
> +			 */
> +			if (group_can_run(csg_slot->group))
> +				csg_slot->group->innocent = true;
> +
>  			/* We consider group suspension failures as fatal and flag the
>  			 * group as unusable by setting timedout=true.
>  			 */
> @@ -3570,6 +3586,8 @@ int panthor_group_get_state(struct panthor_file *pfile,
>  		get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
>  		get_state->fatal_queues = group->fatal_queues;
>  	}
> +	if (group->innocent)
> +		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
>  	mutex_unlock(&sched->lock);
>  
>  	group_put(group);
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index 87c9cb555dd1..b99763cbae48 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -923,6 +923,15 @@ enum drm_panthor_group_state_flags {
>  	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
>  	 */
>  	DRM_PANTHOR_GROUP_STATE_FATAL_FAULT = 1 << 1,
> +
> +	/**
> +	 * @DRM_PANTHOR_GROUP_STATE_INNOCENT: Group was killed during a reset caused by other
> +	 * groups.
> +	 *
> +	 * This flag can only be set if DRM_PANTHOR_GROUP_STATE_TIMEDOUT is set and
> +	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
> +	 */
> +	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
>  };
>  
>  /**
Steven Price Oct. 30, 2024, 1:18 p.m. UTC | #2
On 29/10/2024 16:19, Boris Brezillon wrote:
> On Tue, 29 Oct 2024 16:29:12 +0100
> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> 
>> Groups can be killed during a reset even though they did nothing wrong.
>> That usually happens when the FW is put in a bad state by other groups,
>> resulting in group suspension failures when the reset happens.
>>
>> If we end up in that situation, flag the group innocent and report
>> innocence through a new DRM_PANTHOR_GROUP_STATE flag.
>>
>> Bump the minor driver version to reflect the uAPI change.
>>
>> Changes in v3:
>> - Actually report innocence to userspace
>>
>> Changes in v2:
>> - New patch
>>
>> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
>> ---
>>  drivers/gpu/drm/panthor/panthor_drv.c   |  2 +-
>>  drivers/gpu/drm/panthor/panthor_sched.c | 18 ++++++++++++++++++
>>  include/uapi/drm/panthor_drm.h          |  9 +++++++++
>>  3 files changed, 28 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
>> index ac7e53f6e3f0..f1dff7e0173d 100644
>> --- a/drivers/gpu/drm/panthor/panthor_drv.c
>> +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> 
> Forgot to update the changelog with:
> 
> --- a/drivers/gpu/drm/panthor/panthor_drv.c
> +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> @@ -1493,6 +1493,7 @@ static void panthor_debugfs_init(struct drm_minor
> *minor)
>   * - 1.1 - adds DEV_QUERY_TIMESTAMP_INFO query
>   * - 1.2 - adds DEV_QUERY_GROUP_PRIORITIES_INFO query
>   *       - adds PANTHOR_GROUP_PRIORITY_REALTIME priority
> + * - 1.3 - adds DRM_PANTHOR_GROUP_STATE_INNOCENT flag
>   */
> 
> I'll send a v4 addressing that, but I'll probably queue the first two
> patches to drm-misc-fixes first.

With the changelog update you can add:

Reviewed-by: Steven Price <steven.price@arm.com>

Thanks,
Steve

>> @@ -1507,7 +1507,7 @@ static const struct drm_driver panthor_drm_driver = {
>>  	.desc = "Panthor DRM driver",
>>  	.date = "20230801",
>>  	.major = 1,
>> -	.minor = 2,
>> +	.minor = 3,
>>  
>>  	.gem_create_object = panthor_gem_create_object,
>>  	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
>> diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
>> index ef4bec7ff9c7..97ed5fe5a191 100644
>> --- a/drivers/gpu/drm/panthor/panthor_sched.c
>> +++ b/drivers/gpu/drm/panthor/panthor_sched.c
>> @@ -610,6 +610,16 @@ struct panthor_group {
>>  	 */
>>  	bool timedout;
>>  
>> +	/**
>> +	 * @innocent: True when the group becomes unusable because the group suspension
>> +	 * failed during a reset.
>> +	 *
>> +	 * Sometimes the FW was put in a bad state by other groups, causing the group
>> +	 * suspension happening in the reset path to fail. In that case, we consider the
>> +	 * group innocent.
>> +	 */
>> +	bool innocent;
>> +
>>  	/**
>>  	 * @syncobjs: Pool of per-queue synchronization objects.
>>  	 *
>> @@ -2690,6 +2700,12 @@ void panthor_sched_suspend(struct panthor_device *ptdev)
>>  			u32 csg_id = ffs(slot_mask) - 1;
>>  			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
>>  
>> +			/* If the group was still usable before that point, we consider
>> +			 * it innocent.
>> +			 */
>> +			if (group_can_run(csg_slot->group))
>> +				csg_slot->group->innocent = true;
>> +
>>  			/* We consider group suspension failures as fatal and flag the
>>  			 * group as unusable by setting timedout=true.
>>  			 */
>> @@ -3570,6 +3586,8 @@ int panthor_group_get_state(struct panthor_file *pfile,
>>  		get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
>>  		get_state->fatal_queues = group->fatal_queues;
>>  	}
>> +	if (group->innocent)
>> +		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
>>  	mutex_unlock(&sched->lock);
>>  
>>  	group_put(group);
>> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
>> index 87c9cb555dd1..b99763cbae48 100644
>> --- a/include/uapi/drm/panthor_drm.h
>> +++ b/include/uapi/drm/panthor_drm.h
>> @@ -923,6 +923,15 @@ enum drm_panthor_group_state_flags {
>>  	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
>>  	 */
>>  	DRM_PANTHOR_GROUP_STATE_FATAL_FAULT = 1 << 1,
>> +
>> +	/**
>> +	 * @DRM_PANTHOR_GROUP_STATE_INNOCENT: Group was killed during a reset caused by other
>> +	 * groups.
>> +	 *
>> +	 * This flag can only be set if DRM_PANTHOR_GROUP_STATE_TIMEDOUT is set and
>> +	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
>> +	 */
>> +	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
>>  };
>>  
>>  /**
>
Liviu Dudau Oct. 30, 2024, 1:53 p.m. UTC | #3
On Tue, Oct 29, 2024 at 05:19:33PM +0100, Boris Brezillon wrote:
> On Tue, 29 Oct 2024 16:29:12 +0100
> Boris Brezillon <boris.brezillon@collabora.com> wrote:
> 
> > Groups can be killed during a reset even though they did nothing wrong.
> > That usually happens when the FW is put in a bad state by other groups,
> > resulting in group suspension failures when the reset happens.
> > 
> > If we end up in that situation, flag the group innocent and report
> > innocence through a new DRM_PANTHOR_GROUP_STATE flag.
> > 
> > Bump the minor driver version to reflect the uAPI change.
> > 
> > Changes in v3:
> > - Actually report innocence to userspace
> > 
> > Changes in v2:
> > - New patch
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> > ---
> >  drivers/gpu/drm/panthor/panthor_drv.c   |  2 +-
> >  drivers/gpu/drm/panthor/panthor_sched.c | 18 ++++++++++++++++++
> >  include/uapi/drm/panthor_drm.h          |  9 +++++++++
> >  3 files changed, 28 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
> > index ac7e53f6e3f0..f1dff7e0173d 100644
> > --- a/drivers/gpu/drm/panthor/panthor_drv.c
> > +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> 
> Forgot to update the changelog with:
> 
> --- a/drivers/gpu/drm/panthor/panthor_drv.c
> +++ b/drivers/gpu/drm/panthor/panthor_drv.c
> @@ -1493,6 +1493,7 @@ static void panthor_debugfs_init(struct drm_minor
> *minor)
>   * - 1.1 - adds DEV_QUERY_TIMESTAMP_INFO query
>   * - 1.2 - adds DEV_QUERY_GROUP_PRIORITIES_INFO query
>   *       - adds PANTHOR_GROUP_PRIORITY_REALTIME priority
> + * - 1.3 - adds DRM_PANTHOR_GROUP_STATE_INNOCENT flag
>   */
> 
> I'll send a v4 addressing that, but I'll probably queue the first two
> patches to drm-misc-fixes first.

You can also add my R-b and push the whole series.

Best regards,
Liviu

> 
> > @@ -1507,7 +1507,7 @@ static const struct drm_driver panthor_drm_driver = {
> >  	.desc = "Panthor DRM driver",
> >  	.date = "20230801",
> >  	.major = 1,
> > -	.minor = 2,
> > +	.minor = 3,
> >  
> >  	.gem_create_object = panthor_gem_create_object,
> >  	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
> > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
> > index ef4bec7ff9c7..97ed5fe5a191 100644
> > --- a/drivers/gpu/drm/panthor/panthor_sched.c
> > +++ b/drivers/gpu/drm/panthor/panthor_sched.c
> > @@ -610,6 +610,16 @@ struct panthor_group {
> >  	 */
> >  	bool timedout;
> >  
> > +	/**
> > +	 * @innocent: True when the group becomes unusable because the group suspension
> > +	 * failed during a reset.
> > +	 *
> > +	 * Sometimes the FW was put in a bad state by other groups, causing the group
> > +	 * suspension happening in the reset path to fail. In that case, we consider the
> > +	 * group innocent.
> > +	 */
> > +	bool innocent;
> > +
> >  	/**
> >  	 * @syncobjs: Pool of per-queue synchronization objects.
> >  	 *
> > @@ -2690,6 +2700,12 @@ void panthor_sched_suspend(struct panthor_device *ptdev)
> >  			u32 csg_id = ffs(slot_mask) - 1;
> >  			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
> >  
> > +			/* If the group was still usable before that point, we consider
> > +			 * it innocent.
> > +			 */
> > +			if (group_can_run(csg_slot->group))
> > +				csg_slot->group->innocent = true;
> > +
> >  			/* We consider group suspension failures as fatal and flag the
> >  			 * group as unusable by setting timedout=true.
> >  			 */
> > @@ -3570,6 +3586,8 @@ int panthor_group_get_state(struct panthor_file *pfile,
> >  		get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
> >  		get_state->fatal_queues = group->fatal_queues;
> >  	}
> > +	if (group->innocent)
> > +		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
> >  	mutex_unlock(&sched->lock);
> >  
> >  	group_put(group);
> > diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> > index 87c9cb555dd1..b99763cbae48 100644
> > --- a/include/uapi/drm/panthor_drm.h
> > +++ b/include/uapi/drm/panthor_drm.h
> > @@ -923,6 +923,15 @@ enum drm_panthor_group_state_flags {
> >  	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
> >  	 */
> >  	DRM_PANTHOR_GROUP_STATE_FATAL_FAULT = 1 << 1,
> > +
> > +	/**
> > +	 * @DRM_PANTHOR_GROUP_STATE_INNOCENT: Group was killed during a reset caused by other
> > +	 * groups.
> > +	 *
> > +	 * This flag can only be set if DRM_PANTHOR_GROUP_STATE_TIMEDOUT is set and
> > +	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
> > +	 */
> > +	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
> >  };
> >  
> >  /**
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index ac7e53f6e3f0..f1dff7e0173d 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1507,7 +1507,7 @@  static const struct drm_driver panthor_drm_driver = {
 	.desc = "Panthor DRM driver",
 	.date = "20230801",
 	.major = 1,
-	.minor = 2,
+	.minor = 3,
 
 	.gem_create_object = panthor_gem_create_object,
 	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index ef4bec7ff9c7..97ed5fe5a191 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -610,6 +610,16 @@  struct panthor_group {
 	 */
 	bool timedout;
 
+	/**
+	 * @innocent: True when the group becomes unusable because the group suspension
+	 * failed during a reset.
+	 *
+	 * Sometimes the FW was put in a bad state by other groups, causing the group
+	 * suspension happening in the reset path to fail. In that case, we consider the
+	 * group innocent.
+	 */
+	bool innocent;
+
 	/**
 	 * @syncobjs: Pool of per-queue synchronization objects.
 	 *
@@ -2690,6 +2700,12 @@  void panthor_sched_suspend(struct panthor_device *ptdev)
 			u32 csg_id = ffs(slot_mask) - 1;
 			struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id];
 
+			/* If the group was still usable before that point, we consider
+			 * it innocent.
+			 */
+			if (group_can_run(csg_slot->group))
+				csg_slot->group->innocent = true;
+
 			/* We consider group suspension failures as fatal and flag the
 			 * group as unusable by setting timedout=true.
 			 */
@@ -3570,6 +3586,8 @@  int panthor_group_get_state(struct panthor_file *pfile,
 		get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT;
 		get_state->fatal_queues = group->fatal_queues;
 	}
+	if (group->innocent)
+		get_state->state |= DRM_PANTHOR_GROUP_STATE_INNOCENT;
 	mutex_unlock(&sched->lock);
 
 	group_put(group);
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 87c9cb555dd1..b99763cbae48 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -923,6 +923,15 @@  enum drm_panthor_group_state_flags {
 	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
 	 */
 	DRM_PANTHOR_GROUP_STATE_FATAL_FAULT = 1 << 1,
+
+	/**
+	 * @DRM_PANTHOR_GROUP_STATE_INNOCENT: Group was killed during a reset caused by other
+	 * groups.
+	 *
+	 * This flag can only be set if DRM_PANTHOR_GROUP_STATE_TIMEDOUT is set and
+	 * DRM_PANTHOR_GROUP_STATE_FATAL_FAULT is not.
+	 */
+	DRM_PANTHOR_GROUP_STATE_INNOCENT = 1 << 2,
 };
 
 /**