diff mbox series

[4/4] drm/xe/xe_drm_client: Add per drm client reset stats

Message ID 20250214203757.27895-5-jonathan.cavitt@intel.com (mailing list archive)
State New
Headers show
Series drm/xe/xe_drm_client: Add per drm client reset stats | expand

Commit Message

Jonathan Cavitt Feb. 14, 2025, 8:37 p.m. UTC
Add a counter to xe_drm_client that tracks the number of times the
engine has been reset since the drm client was created.

Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
---
 drivers/gpu/drm/xe/xe_drm_client.c | 2 ++
 drivers/gpu/drm/xe/xe_drm_client.h | 2 ++
 drivers/gpu/drm/xe/xe_guc_submit.c | 4 +++-
 3 files changed, 7 insertions(+), 1 deletion(-)

Comments

Tvrtko Ursulin Feb. 18, 2025, 6:45 p.m. UTC | #1
On 14/02/2025 20:37, Jonathan Cavitt wrote:
> Add a counter to xe_drm_client that tracks the number of times the
> engine has been reset since the drm client was created.
> 
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> ---
>   drivers/gpu/drm/xe/xe_drm_client.c | 2 ++
>   drivers/gpu/drm/xe/xe_drm_client.h | 2 ++
>   drivers/gpu/drm/xe/xe_guc_submit.c | 4 +++-
>   3 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
> index f15560d0b6ff..ecd2ce99fd19 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.c
> +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> @@ -492,6 +492,8 @@ static void show_blames(struct drm_printer *p, struct drm_file *file)
>   
>   	client = xef->client;
>   
> +	drm_printf(p, "drm-client-reset-count:%u\n",
> +		   atomic_read(&client->reset_count));

When the drm- prefix is used, the keys have to be agreed upon in
drm-usage-stats.rst. Therefore, I suggest looking across different drivers
to see if anyone else would be interested, for example the people who
worked on the common DRM wedged event.

Or, in cases where new stats are not universally useful, drivers can
prefix them with xe- instead. We had this discussion recently about some
panthor internal memory stats.

Regards,

Tvrtko

>   	drm_printf(p, "\n");
>   	drm_printf(p, "- Exec queue ban list -\n");
>   	spin_lock(&client->blame_lock);
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h
> index d21fd0b90742..c35de675ccfa 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.h
> +++ b/drivers/gpu/drm/xe/xe_drm_client.h
> @@ -53,6 +53,8 @@ struct xe_drm_client {
>   	 * Protected by @blame_lock;
>   	 */
>   	struct list_head blame_list;
> +	/** @reset_count: number of times this drm client has seen an engine reset */
> +	atomic_t reset_count;
>   #endif
>   };
>   
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index d9da5c89429e..8810abc8f04a 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1988,7 +1988,9 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>   		return -EPROTO;
>   
>   	hwe = q->hwe;
> -
> +#ifdef CONFIG_PROC_FS
> +	atomic_inc(&q->xef->client->reset_count);
> +#endif
>   	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
>   		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>
Simona Vetter Feb. 19, 2025, 1:45 p.m. UTC | #2
On Tue, Feb 18, 2025 at 06:45:30PM +0000, Tvrtko Ursulin wrote:
> 
> On 14/02/2025 20:37, Jonathan Cavitt wrote:
> > Add a counter to xe_drm_client that tracks the number of times the
> > engine has been reset since the drm client was created.
> > 
> > Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> > ---
> >   drivers/gpu/drm/xe/xe_drm_client.c | 2 ++
> >   drivers/gpu/drm/xe/xe_drm_client.h | 2 ++
> >   drivers/gpu/drm/xe/xe_guc_submit.c | 4 +++-
> >   3 files changed, 7 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
> > index f15560d0b6ff..ecd2ce99fd19 100644
> > --- a/drivers/gpu/drm/xe/xe_drm_client.c
> > +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> > @@ -492,6 +492,8 @@ static void show_blames(struct drm_printer *p, struct drm_file *file)
> >   	client = xef->client;
> > +	drm_printf(p, "drm-client-reset-count:%u\n",
> > +		   atomic_read(&client->reset_count));
> 
> When drm- prefix is used keys have to be agreed in drm-usage-stats.rst.
> Therefore I suggest exploring across different drivers and seeing if anyone
> else would be interested. Maybe people who worked on the DRM common wedged
> event for example.

+1 on standardizing wedge/reset tracking across drivers more. I guess
ideally we could integrate this into one thing to make sure it's
consistently reported across all drivers.
-Sima

> 
> Or in cases when new stats are not universally useful drivers can prefix
> with xe-. We had this discussion recently with some panthor internal memory
> stats.
> 
> Regards,
> 
> Tvrtko
> 
> >   	drm_printf(p, "\n");
> >   	drm_printf(p, "- Exec queue ban list -\n");
> >   	spin_lock(&client->blame_lock);
> > diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h
> > index d21fd0b90742..c35de675ccfa 100644
> > --- a/drivers/gpu/drm/xe/xe_drm_client.h
> > +++ b/drivers/gpu/drm/xe/xe_drm_client.h
> > @@ -53,6 +53,8 @@ struct xe_drm_client {
> >   	 * Protected by @blame_lock;
> >   	 */
> >   	struct list_head blame_list;
> > +	/** @reset_count: number of times this drm client has seen an engine reset */
> > +	atomic_t reset_count;
> >   #endif
> >   };
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index d9da5c89429e..8810abc8f04a 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -1988,7 +1988,9 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
> >   		return -EPROTO;
> >   	hwe = q->hwe;
> > -
> > +#ifdef CONFIG_PROC_FS
> > +	atomic_inc(&q->xef->client->reset_count);
> > +#endif
> >   	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
> >   		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
index f15560d0b6ff..ecd2ce99fd19 100644
--- a/drivers/gpu/drm/xe/xe_drm_client.c
+++ b/drivers/gpu/drm/xe/xe_drm_client.c
@@ -492,6 +492,8 @@  static void show_blames(struct drm_printer *p, struct drm_file *file)
 
 	client = xef->client;
 
+	drm_printf(p, "drm-client-reset-count:%u\n",
+		   atomic_read(&client->reset_count));
 	drm_printf(p, "\n");
 	drm_printf(p, "- Exec queue ban list -\n");
 	spin_lock(&client->blame_lock);
diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h
index d21fd0b90742..c35de675ccfa 100644
--- a/drivers/gpu/drm/xe/xe_drm_client.h
+++ b/drivers/gpu/drm/xe/xe_drm_client.h
@@ -53,6 +53,8 @@  struct xe_drm_client {
 	 * Protected by @blame_lock;
 	 */
 	struct list_head blame_list;
+	/** @reset_count: number of times this drm client has seen an engine reset */
+	atomic_t reset_count;
 #endif
 };
 
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index d9da5c89429e..8810abc8f04a 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -1988,7 +1988,9 @@  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
 		return -EPROTO;
 
 	hwe = q->hwe;
-
+#ifdef CONFIG_PROC_FS
+	atomic_inc(&q->xef->client->reset_count);
+#endif
 	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
 		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);