diff mbox series

[3/4] FIXME: drm/xe/xe_drm_client: Add per drm client pagefault info

Message ID 20250214203757.27895-4-jonathan.cavitt@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/xe/xe_drm_client: Add per drm client reset stats | expand

Commit Message

Cavitt, Jonathan Feb. 14, 2025, 8:37 p.m. UTC
Add additional information to the drm client so it can report the last 50
exec queues that were banned on it, as well as the last pagefault seen
when those exec queues were banned.  Since we cannot reasonably
associate a pagefault with a specific exec queue, we currently report the
last pagefault seen on the associated hw engine instead.

The last pagefault seen on each hw engine is saved to that engine, and the
saved pagefault is updated during the pagefault handling process in
xe_gt_pagefault.  The saved pagefault is cleared when the engine is
reset, because any exec queue bans after the reset were likely not caused
by that pagefault.

v2: Remove exec queue from blame list on destroy and recreate (Joonas)

Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
---
 drivers/gpu/drm/xe/xe_drm_client.c      | 128 ++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_drm_client.h      |  36 +++++++
 drivers/gpu/drm/xe/xe_exec_queue.c      |   7 ++
 drivers/gpu/drm/xe/xe_gt_pagefault.c    |  19 ++++
 drivers/gpu/drm/xe/xe_guc_submit.c      |  17 ++++
 drivers/gpu/drm/xe/xe_hw_engine.c       |   4 +
 drivers/gpu/drm/xe/xe_hw_engine_types.h |   8 ++
 7 files changed, 219 insertions(+)

Comments

kernel test robot Feb. 16, 2025, 4:04 a.m. UTC | #1
Hi Jonathan,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-xe/drm-xe-next]
[also build test WARNING on drm-exynos/exynos-drm-next linus/master v6.14-rc2 next-20250214]
[cannot apply to drm/drm-next drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-misc/drm-misc-next drm-tip/drm-tip]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Jonathan-Cavitt/drm-xe-xe_exec_queue-Add-ID-param-to-exec-queue-struct/20250215-043933
base:   https://gitlab.freedesktop.org/drm/xe/kernel.git drm-xe-next
patch link:    https://lore.kernel.org/r/20250214203757.27895-4-jonathan.cavitt%40intel.com
patch subject: [PATCH 3/4] FIXME: drm/xe/xe_drm_client: Add per drm client pagefault info
config: alpha-randconfig-r051-20250216 (https://download.01.org/0day-ci/archive/20250216/202502161130.qHItt8dh-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 14.2.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502161130.qHItt8dh-lkp@intel.com/

cocci warnings: (new ones prefixed by >>)
>> drivers/gpu/drm/xe/xe_gt_pagefault.c:342:3-8: WARNING: NULL check before some freeing functions is not needed.

vim +342 drivers/gpu/drm/xe/xe_gt_pagefault.c

   332	
   333	static void save_pagefault_to_engine(struct xe_gt *gt, struct pagefault *pf)
   334	{
   335		struct xe_hw_engine *hwe;
   336	
   337		hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
   338		if (hwe) {
   339			spin_lock(&hwe->pf.lock);
   340			/** The latest pagefault is pf, so remove old pf info from engine */
   341			if (hwe->pf.info)
 > 342				kfree(hwe->pf.info);
   343			hwe->pf.info = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
   344			if (hwe->pf.info)
   345				memcpy(hwe->pf.info, pf, sizeof(struct pagefault));
   346			spin_unlock(&hwe->pf.lock);
   347		}
   348	}
   349
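
The warning fires because kfree(NULL) is already a no-op, so the NULL check
can simply be dropped. A minimal sketch of the simplified helper (kmemdup()
with GFP_ATOMIC is an editorial assumption here, chosen because a GFP_KERNEL
allocation can sleep and is not safe under the spinlock this hunk holds):

  static void save_pagefault_to_engine(struct xe_gt *gt, struct pagefault *pf)
  {
          struct xe_hw_engine *hwe;

          hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
          if (!hwe)
                  return;

          spin_lock(&hwe->pf.lock);
          /* pf is the latest fault; kfree(NULL) is a no-op, so no check needed */
          kfree(hwe->pf.info);
          /* GFP_ATOMIC: no sleeping allocations while holding the spinlock */
          hwe->pf.info = kmemdup(pf, sizeof(*pf), GFP_ATOMIC);
          spin_unlock(&hwe->pf.lock);
  }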
Simona Vetter Feb. 17, 2025, 5:01 p.m. UTC | #2
On Fri, Feb 14, 2025 at 08:37:56PM +0000, Jonathan Cavitt wrote:
> Add additional information to drm client so it can report the last 50
> exec queues to have been banned on it, as well as the last pagefault
> seen when said exec queues were banned.  Since we cannot reasonably
> associate a pagefault to a specific exec queue, we currently report the
> last seen pagefault on the associated hw engine instead.
> 
> The last pagefault seen per exec queue is saved to the hw engine, and the
> pagefault is updated during the pagefault handling process in
> xe_gt_pagefault.  The last seen pagefault is reset when the engine is
> reset because any future exec queue bans likely were not caused by said
> pagefault after the reset.
> 
> v2: Remove exec queue from blame list on destroy and recreate (Joonas)
> 
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_drm_client.c      | 128 ++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_drm_client.h      |  36 +++++++
>  drivers/gpu/drm/xe/xe_exec_queue.c      |   7 ++
>  drivers/gpu/drm/xe/xe_gt_pagefault.c    |  19 ++++
>  drivers/gpu/drm/xe/xe_guc_submit.c      |  17 ++++
>  drivers/gpu/drm/xe/xe_hw_engine.c       |   4 +
>  drivers/gpu/drm/xe/xe_hw_engine_types.h |   8 ++
>  7 files changed, 219 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
> index 2d4874d2b922..f15560d0b6ff 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.c
> +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> @@ -17,6 +17,7 @@
>  #include "xe_exec_queue.h"
>  #include "xe_force_wake.h"
>  #include "xe_gt.h"
> +#include "xe_gt_pagefault.h"
>  #include "xe_hw_engine.h"
>  #include "xe_pm.h"
>  #include "xe_trace.h"
> @@ -70,6 +71,21 @@
>   * 	drm-total-cycles-ccs:   7655183225
>   * 	drm-engine-capacity-ccs:        4
>   *
> + * 	- Exec queue ban list -

This looks like you're just totally tossing the drm fdinfo format and
going with something no one else can even parse.

I think it's time for some proper helper functions/macros and not
open-coded drm_printf for this stuff.

Also for sure needs an ack from Tvrtko.

Cheers, Sima
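
For reference, a sketch of the direction Sima is asking for: flat,
hypothetical "drm-xe-" keys in the documented "key: value" fdinfo style,
emitted by a shared helper instead of open-coded drm_printf. The helper name
and key names below are illustrative only, and the exact schema (including
how to keep keys unique across multiple banned queues) is precisely what
would need an ack:

  /* Hypothetical helper: emit one blame entry as flat "key: value"
   * pairs so standard fdinfo parsers can consume or skip them. */
  static void blame_print(struct drm_printer *p, const struct blame *b)
  {
          const struct pagefault *pf = b->pf;

          drm_printf(p, "drm-xe-banned-exec-queue: %u\n", b->exec_queue_id);
          if (!pf)
                  return;
          drm_printf(p, "drm-xe-banned-exec-queue-pf-asid: %u\n", pf->asid);
          drm_printf(p, "drm-xe-banned-exec-queue-pf-address: 0x%016llx\n",
                     (unsigned long long)pf->page_addr);
          drm_printf(p, "drm-xe-banned-exec-queue-pf-access: %s\n",
                     access_type_to_str(pf->access_type));
  }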

> + *
> + * 		Exec queue 1 banned:
> + * 			Associated pagefault:
> + * 			ASID: 9
> + * 			VFID: 0
> + * 			PDATA: 0x0450
> + * 			Faulted Address: 0x000001fff86a9000
> + * 			FaultType: NOT_PRESENT
> + * 			AccessType: ACCESS_TYPE_WRITE
> + * 			FaultLevel: 0
> + * 			EngineClass: 1 vcs
> + * 			EngineInstance: 0
> + *		
> + *
>   * Possible `drm-cycles-` key names are: `rcs`, `ccs`, `bcs`, `vcs`, `vecs` and
>   * "other".
>   */
> @@ -97,6 +113,8 @@ struct xe_drm_client *xe_drm_client_alloc(void)
>  #ifdef CONFIG_PROC_FS
>  	spin_lock_init(&client->bos_lock);
>  	INIT_LIST_HEAD(&client->bos_list);
> +	spin_lock_init(&client->blame_lock);
> +	INIT_LIST_HEAD(&client->blame_list);
>  #endif
>  	return client;
>  }
> @@ -164,6 +182,72 @@ void xe_drm_client_remove_bo(struct xe_bo *bo)
>  	xe_drm_client_put(client);
>  }
>  
> +static void free_blame(struct blame *b)
> +{
> +	list_del(&b->list);
> +	kfree(b->pf);
> +	kfree(b);
> +}
> +
> +void xe_drm_client_add_blame(struct xe_drm_client *client,
> +			     struct xe_exec_queue *q)
> +{
> +	struct blame *b = NULL;
> +	struct list_head *h;
> +	struct pagefault *pf = NULL;
> +	struct xe_file *xef = q->xef;
> +	struct xe_hw_engine *hwe = q->hwe;
> +	unsigned long count;
> +
> +	b = kzalloc(sizeof(struct blame), GFP_KERNEL);
> +	xe_assert(xef->xe, b);
> +
> +	spin_lock(&client->blame_lock);
> +	list_add_tail(&b->list, &client->blame_list);
> +	/**
> +	 * Limit the number of blames in the blame list to prevent memory overuse.
> +	 *
> +	 * TODO: Parameterize max blame list size.
> +	 */
> +	count = 0;
> +	list_for_each(h, &client->blame_list)
> +		count++;
> +	if (count >= 50) {
> +		struct blame *rem = list_first_entry(&client->blame_list, struct blame, list);
> +		free_blame(rem);
> +	}
> +	spin_unlock(&client->blame_lock);
> +
> +	/**
> +	 * Duplicate pagefault on engine to blame, if one may have caused the
> +	 * exec queue to be banned.
> +	 */
> +	b->pf = NULL;
> +	spin_lock(&hwe->pf.lock);
> +	if (hwe->pf.info) {
> +		pf = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
> +		memcpy(pf, hwe->pf.info, sizeof(struct pagefault));
> +	}
> +	spin_unlock(&hwe->pf.lock);
> +
> +	/** Save blame data to list element */
> +	b->exec_queue_id = q->id;
> +	b->pf = pf;
> +}
> +
> +void xe_drm_client_remove_blame(struct xe_drm_client *client,
> +				struct xe_exec_queue *q)
> +{
> +	struct blame *b, *tmp;
> +
> +	spin_lock(&client->blame_lock);
> +	list_for_each_entry_safe(b, tmp, &client->blame_list, list)
> +		if (b->exec_queue_id == q->id)
> +			free_blame(b);
> +	spin_unlock(&client->blame_lock);
> +
> +}
> +
>  static void bo_meminfo(struct xe_bo *bo,
>  		       struct drm_memory_stats stats[TTM_NUM_MEM_TYPES])
>  {
> @@ -380,6 +464,49 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>  	}
>  }
>  
> +static void print_pagefault(struct drm_printer *p, struct pagefault *pf)
> +{
> +	drm_printf(p, "\n\t\tASID: %d\n"
> +	 "\t\tVFID: %d\n"
> +	 "\t\tPDATA: 0x%04x\n"
> +	 "\t\tFaulted Address: 0x%08x%08x\n"
> +	 "\t\tFaultType: %s\n"
> +	 "\t\tAccessType: %s\n"
> +	 "\t\tFaultLevel: %d\n"
> +	 "\t\tEngineClass: %d %s\n"
> +	 "\t\tEngineInstance: %d\n",
> +	 pf->asid, pf->vfid, pf->pdata, upper_32_bits(pf->page_addr),
> +	 lower_32_bits(pf->page_addr),
> +	 fault_type_to_str(pf->fault_type),
> +	 access_type_to_str(pf->access_type),
> +	 pf->fault_level, pf->engine_class,
> +	 xe_hw_engine_class_to_str(pf->engine_class),
> +	 pf->engine_instance);
> +}
> +
> +static void show_blames(struct drm_printer *p, struct drm_file *file)
> +{
> +	struct xe_file *xef = file->driver_priv;
> +	struct xe_drm_client *client;
> +	struct blame *b;
> +
> +	client = xef->client;
> +
> +	drm_printf(p, "\n");
> +	drm_printf(p, "- Exec queue ban list -\n");
> +	spin_lock(&client->blame_lock);
> +	list_for_each_entry(b, &client->blame_list, list) {
> +		struct pagefault *pf = b->pf;
> +		drm_printf(p, "\n\tExec queue %u banned:\n", b->exec_queue_id);
> +		drm_printf(p, "\t\tAssociated pagefault:\n");
> +		if (pf)
> +			print_pagefault(p, pf);
> +		else
> +			drm_printf(p, "\t\t- No associated pagefault -\n");
> +	}
> +	spin_unlock(&client->blame_lock);
> +}
> +
>  /**
>   * xe_drm_client_fdinfo() - Callback for fdinfo interface
>   * @p: The drm_printer ptr
> @@ -394,5 +521,6 @@ void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file)
>  {
>  	show_meminfo(p, file);
>  	show_run_ticks(p, file);
> +	show_blames(p, file);
>  }
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h
> index a9649aa36011..d21fd0b90742 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.h
> +++ b/drivers/gpu/drm/xe/xe_drm_client.h
> @@ -15,7 +15,18 @@
>  
>  struct drm_file;
>  struct drm_printer;
> +struct pagefault;
>  struct xe_bo;
> +struct xe_exec_queue;
> +
> +struct blame {
> +	/** @exec_queue_id: ID number of banned exec queue */
> +	u32 exec_queue_id;
> +	/** @pf: pagefault on engine of banned exec queue, if any at time */
> +	struct pagefault *pf;
> +	/** @list: link into @xe_drm_client.blame_list */
> +	struct list_head list;
> +};
>  
>  struct xe_drm_client {
>  	struct kref kref;
> @@ -31,6 +42,17 @@ struct xe_drm_client {
>  	 * Protected by @bos_lock.
>  	 */
>  	struct list_head bos_list;
> +	/**
> +	 * @blame_lock: lock protecting @blame_list
> +	 */
> +	spinlock_t blame_lock;
> +	/**
> +	 * @blame_list: list of banned exec queues associated with this drm
> +	 *		client, as well as any pagefaults at time of ban.
> +	 *
> +	 * Protected by @blame_lock;
> +	 */
> +	struct list_head blame_list;
>  #endif
>  };
>  
> @@ -57,6 +79,10 @@ void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file);
>  void xe_drm_client_add_bo(struct xe_drm_client *client,
>  			  struct xe_bo *bo);
>  void xe_drm_client_remove_bo(struct xe_bo *bo);
> +void xe_drm_client_add_blame(struct xe_drm_client *client,
> +			     struct xe_exec_queue *q);
> +void xe_drm_client_remove_blame(struct xe_drm_client *client,
> +				struct xe_exec_queue *q);
>  #else
>  static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
>  					struct xe_bo *bo)
> @@ -66,5 +92,15 @@ static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
>  static inline void xe_drm_client_remove_bo(struct xe_bo *bo)
>  {
>  }
> +
> +static inline void xe_drm_client_add_blame(struct xe_drm_client *client,
> +					   struct xe_exec_queue *q)
> +{
> +}
> +
> +static inline void xe_drm_client_remove_blame(struct xe_drm_client *client,
> +					      struct xe_exec_queue *q)
> +{
> +}
>  #endif
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> index a02e62465e01..9c9bc617020c 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> @@ -13,6 +13,7 @@
>  #include <uapi/drm/xe_drm.h>
>  
>  #include "xe_device.h"
> +#include "xe_drm_client.h"
>  #include "xe_gt.h"
>  #include "xe_hw_engine_class_sysfs.h"
>  #include "xe_hw_engine_group.h"
> @@ -714,6 +715,12 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
>  	q->id = id;
>  	args->exec_queue_id = id;
>  
> +	/**
> +	 * If an exec queue in the blame list shares the same exec queue
> +	 * ID, remove it from the blame list to avoid confusion.
> +	 */
> +	xe_drm_client_remove_blame(q->xef->client, q);
> +
>  	return 0;
>  
>  kill_exec_queue:
> diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> index fe18e3ec488a..a0e6f2281e37 100644
> --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
> +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
> @@ -330,6 +330,23 @@ int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	return full ? -ENOSPC : 0;
>  }
>  
> +static void save_pagefault_to_engine(struct xe_gt *gt, struct pagefault *pf)
> +{
> +	struct xe_hw_engine *hwe;
> +
> +	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
> +	if (hwe) {
> +		spin_lock(&hwe->pf.lock);
> +		/** The latest pagefault is pf, so remove old pf info from engine */
> +		if (hwe->pf.info)
> +			kfree(hwe->pf.info);
> +		hwe->pf.info = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
> +		if (hwe->pf.info)
> +			memcpy(hwe->pf.info, pf, sizeof(struct pagefault));
> +		spin_unlock(&hwe->pf.lock);
> +	}
> +}
> +
>  #define USM_QUEUE_MAX_RUNTIME_MS	20
>  
>  static void pf_queue_work_func(struct work_struct *w)
> @@ -352,6 +369,8 @@ static void pf_queue_work_func(struct work_struct *w)
>  			drm_dbg(&xe->drm, "Fault response: Unsuccessful %d\n", ret);
>  		}
>  
> +		save_pagefault_to_engine(gt, &pf);
> +
>  		reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
>  			FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) |
>  			FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 913c74d6e2ae..d9da5c89429e 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -20,11 +20,13 @@
>  #include "xe_assert.h"
>  #include "xe_devcoredump.h"
>  #include "xe_device.h"
> +#include "xe_drm_client.h"
>  #include "xe_exec_queue.h"
>  #include "xe_force_wake.h"
>  #include "xe_gpu_scheduler.h"
>  #include "xe_gt.h"
>  #include "xe_gt_clock.h"
> +#include "xe_gt_pagefault.h"
>  #include "xe_gt_printk.h"
>  #include "xe_guc.h"
>  #include "xe_guc_capture.h"
> @@ -146,6 +148,7 @@ static bool exec_queue_banned(struct xe_exec_queue *q)
>  static void set_exec_queue_banned(struct xe_exec_queue *q)
>  {
>  	atomic_or(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
> +	xe_drm_client_add_blame(q->xef->client, q);
>  }
>  
>  static bool exec_queue_suspended(struct xe_exec_queue *q)
> @@ -1971,6 +1974,7 @@ int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  {
>  	struct xe_gt *gt = guc_to_gt(guc);
> +	struct xe_hw_engine *hwe;
>  	struct xe_exec_queue *q;
>  	u32 guc_id;
>  
> @@ -1983,11 +1987,24 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
>  	if (unlikely(!q))
>  		return -EPROTO;
>  
> +	hwe = q->hwe;
> +
>  	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
>  		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
>  
>  	trace_xe_exec_queue_reset(q);
>  
> +	/**
> +	 * Clear last pagefault from engine.  Any future exec queue bans likely were
> +	 * not caused by said pagefault now that the engine has reset.
> +	 */
> +	spin_lock(&hwe->pf.lock);
> +	if (hwe->pf.info) {
> +		kfree(hwe->pf.info);
> +		hwe->pf.info = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
> +	}
> +	spin_unlock(&hwe->pf.lock);
> +
>  	/*
>  	 * A banned engine is a NOP at this point (came from
>  	 * guc_exec_queue_timedout_job). Otherwise, kick drm scheduler to cancel
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
> index fc447751fe78..69f61e4905e2 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine.c
> +++ b/drivers/gpu/drm/xe/xe_hw_engine.c
> @@ -21,6 +21,7 @@
>  #include "xe_gsc.h"
>  #include "xe_gt.h"
>  #include "xe_gt_ccs_mode.h"
> +#include "xe_gt_pagefault.h"
>  #include "xe_gt_printk.h"
>  #include "xe_gt_mcr.h"
>  #include "xe_gt_topology.h"
> @@ -557,6 +558,9 @@ static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
>  		hwe->eclass->defaults = hwe->eclass->sched_props;
>  	}
>  
> +	hwe->pf.info = NULL;
> +	spin_lock_init(&hwe->pf.lock);
> +
>  	xe_reg_sr_init(&hwe->reg_sr, hwe->name, gt_to_xe(gt));
>  	xe_tuning_process_engine(hwe);
>  	xe_wa_process_engine(hwe);
> diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> index e4191a7a2c31..2e1be9481d9b 100644
> --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
> +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
> @@ -64,6 +64,7 @@ enum xe_hw_engine_id {
>  struct xe_bo;
>  struct xe_execlist_port;
>  struct xe_gt;
> +struct pagefault;
>  
>  /**
>   * struct xe_hw_engine_class_intf - per hw engine class struct interface
> @@ -150,6 +151,13 @@ struct xe_hw_engine {
>  	struct xe_oa_unit *oa_unit;
>  	/** @hw_engine_group: the group of hw engines this one belongs to */
>  	struct xe_hw_engine_group *hw_engine_group;
> +	/** @pf: the last pagefault seen on this engine */
> +	struct {
> +		/** @pf.info: info containing last seen pagefault details */
> +		struct pagefault *info;
> +		/** @pf.lock: lock protecting @pf.info */
> +		spinlock_t lock;
> +	} pf;
>  };
>  
>  enum xe_hw_engine_snapshot_source_id {
> -- 
> 2.43.0
>
Cavitt, Jonathan Feb. 18, 2025, 3:11 p.m. UTC | #3
-----Original Message-----
From: Simona Vetter <simona.vetter@ffwll.ch> 
Sent: Monday, February 17, 2025 9:01 AM
To: Cavitt, Jonathan <jonathan.cavitt@intel.com>
Cc: intel-xe@lists.freedesktop.org; dri-devel@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; Lahtinen, Joonas <joonas.lahtinen@intel.com>; De Marchi, Lucas <lucas.demarchi@intel.com>; Brost, Matthew <matthew.brost@intel.com>; tvrtko.ursulin@igalia.com
Subject: Re: [PATCH 3/4] FIXME: drm/xe/xe_drm_client: Add per drm client pagefault info
> 
> On Fri, Feb 14, 2025 at 08:37:56PM +0000, Jonathan Cavitt wrote:
> > [snip]
> > @@ -70,6 +71,21 @@
> >   * 	drm-total-cycles-ccs:   7655183225
> >   * 	drm-engine-capacity-ccs:        4
> >   *
> > + * 	- Exec queue ban list -
> 
> This looks like you're just totally tossing the drm fdinfo format and
> going with something no one else can even parse.
> 
> I think it's time for some proper helper functions/macros and not
> open-coded drm_printf for this stuff.

Understood.  I'll see what I can do about aligning the output with
what currently exists.
-Jonathan Cavitt

> 
> Also for sure needs an ack from Tvrtko.
> 
> Cheers, Sima
> 
> > [snip]
Tvrtko Ursulin Feb. 18, 2025, 6:39 p.m. UTC | #4
On 17/02/2025 17:01, Simona Vetter wrote:
> On Fri, Feb 14, 2025 at 08:37:56PM +0000, Jonathan Cavitt wrote:
>> [snip]
>> @@ -70,6 +71,21 @@
>>    * 	drm-total-cycles-ccs:   7655183225
>>    * 	drm-engine-capacity-ccs:        4
>>    *
>> + * 	- Exec queue ban list -
> 
> This looks like you're just totally tossing the drm fdinfo format and
> going with something no one else can even parse.
> 
> I think it's time for some proper helper functions/macros and not
> open-coded drm_printf for this stuff.
> 
> Also for sure needs an ack from Tvrtko.
> 
> Cheers, Sima
> 
>> + *
>> + * 		Exec queue 1 banned:
>> + * 			Associated pagefault:
>> + * 			ASID: 9
>> + * 			VFID: 0
>> + * 			PDATA: 0x0450
>> + * 			Faulted Address: 0x000001fff86a9000
>> + * 			FaultType: NOT_PRESENT
>> + * 			AccessType: ACCESS_TYPE_WRITE
>> + * 			FaultLevel: 0
>> + * 			EngineClass: 1 vcs
>> + * 			EngineInstance: 0
>> + *		
>> + *
>>    * Possible `drm-cycles-` key names are: `rcs`, `ccs`, `bcs`, `vcs`, `vecs` and
>>    * "other".

From one angle one could say the new data is not drm fdinfo compliant 
and so existing parsers will just skip over it and do nothing. Which is 
not a problem per se.

So without going into any analysis of what data is exposed here and its 
usefulness maybe drm fdinfo does not care.

Then from another angle, going slightly deeper into what is proposed 
here, it _seems_ to be adding some sort of logging to fdinfo which is 
way different from the current single-counter paradigm. I wonder if there 
is precedent anywhere in the kernel for stuffing logs into fdinfo?

My first reaction would be that it feels like a questionable approach (how 
will userspace correlate "Exec queue *N*" to something?). I guess more 
info on how this is useful for userspace should be added to the patch 
description.

Regards,

Tvrtko

>> [snip]
Cavitt, Jonathan Feb. 18, 2025, 8:40 p.m. UTC | #5
-----Original Message-----
From: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> 
Sent: Tuesday, February 18, 2025 10:39 AM
To: Simona Vetter <simona.vetter@ffwll.ch>; Cavitt, Jonathan <jonathan.cavitt@intel.com>
Cc: intel-xe@lists.freedesktop.org; dri-devel@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; Lahtinen, Joonas <joonas.lahtinen@intel.com>; De Marchi, Lucas <lucas.demarchi@intel.com>; Brost, Matthew <matthew.brost@intel.com>
Subject: Re: [PATCH 3/4] FIXME: drm/xe/xe_drm_client: Add per drm client pagefault info
> 
> On 17/02/2025 17:01, Simona Vetter wrote:
> > On Fri, Feb 14, 2025 at 08:37:56PM +0000, Jonathan Cavitt wrote:
> >> [snip]
> >> @@ -70,6 +71,21 @@
> >>    * 	drm-total-cycles-ccs:   7655183225
> >>    * 	drm-engine-capacity-ccs:        4
> >>    *
> >> + * 	- Exec queue ban list -
> > 
> > This looks like you're just totally tossing the drm fdinfo format and
> > going with something no one else can even parse.
> > 
> > I think it's time for some proper helper functions/macros and not
> > open-coded drm_printf for this stuff.
> > 
> > Also for sure needs an ack from Tvrtko.
> > 
> > Cheers, Sima
> > 
> >> + *
> >> + * 		Exec queue 1 banned:
> >> + * 			Associated pagefault:
> >> + * 			ASID: 9
> >> + * 			VFID: 0
> >> + * 			PDATA: 0x0450
> >> + * 			Faulted Address: 0x000001fff86a9000
> >> + * 			FaultType: NOT_PRESENT
> >> + * 			AccessType: ACCESS_TYPE_WRITE
> >> + * 			FaultLevel: 0
> >> + * 			EngineClass: 1 vcs
> >> + * 			EngineInstance: 0
> >> + *		
> >> + *
> >>    * Possible `drm-cycles-` key names are: `rcs`, `ccs`, `bcs`, `vcs`, `vecs` and
> >>    * "other".
> 
> From one angle one could say the new data is not drm fdinfo compliant 
> and so existing parsers will just skip over it and do nothing. Which is 
> not a problem per se.
> 
> So without going into any analysis of what data is exposed here and its 
> usefulness maybe drm fdinfo does not care.
> 
> Then from another angle, going slightly deeper into what is proposed 
> here, it _seems_ to be adding some sort of logging to fdinfo which is 
> way different from the current single-counter paradigm. I wonder if there 
> is precedent anywhere in the kernel for stuffing logs into fdinfo?
> 
> My first reaction would be that it feels like a questionable approach (how 
> will userspace correlate "Exec queue *N*" to something?). I guess more 

The user is given an exec queue ID number when they call the exec queue
create ioctl, and that ID is reported back as "Exec queue *N*" here.

However, it was requested that this be converted into an xe_query query
(by Joonas), so this will no longer be reported through xe_drm_client_fdinfo.
Apologies for the confusion.
-Jonathan Cavitt

> info on how this is useful for userspace should be added to the patch 
> description.
> 
> Regards,
> 
> Tvrtko
> 
> >> [snip]
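
For context on the ID discussion above: with the existing xe uAPI, userspace
obtains the exec queue ID from the create ioctl and can correlate it against
whatever interface ends up exposing the ban list. A minimal userspace sketch
(error handling elided; the helper name and include path are illustrative,
not from this series):

/* Illustrative only: create an exec queue and read back the ID that the
 * proposed "Exec queue <N>" entries would refer to.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/xe_drm.h>

static int64_t create_render_queue(int fd, uint32_t vm_id)
{
	struct drm_xe_engine_class_instance inst = {
		.engine_class = DRM_XE_ENGINE_CLASS_RENDER,
	};
	struct drm_xe_exec_queue_create create = {
		.width = 1,
		.num_placements = 1,
		.vm_id = vm_id,
		.instances = (uintptr_t)&inst,
	};

	if (ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &create))
		return -1;

	return create.exec_queue_id;	/* the "Exec queue <N>" number */
}
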
Simona Vetter Feb. 19, 2025, 1:44 p.m. UTC | #6
On Tue, Feb 18, 2025 at 08:40:08PM +0000, Cavitt, Jonathan wrote:
> -----Original Message-----
> From: Tvrtko Ursulin <tvrtko.ursulin@igalia.com> 
> Sent: Tuesday, February 18, 2025 10:39 AM
> To: Simona Vetter <simona.vetter@ffwll.ch>; Cavitt, Jonathan <jonathan.cavitt@intel.com>
> Cc: intel-xe@lists.freedesktop.org; dri-devel@lists.freedesktop.org; Gupta, saurabhg <saurabhg.gupta@intel.com>; Zuo, Alex <alex.zuo@intel.com>; Lahtinen, Joonas <joonas.lahtinen@intel.com>; De Marchi, Lucas <lucas.demarchi@intel.com>; Brost, Matthew <matthew.brost@intel.com>
> Subject: Re: [PATCH 3/4] FIXME: drm/xe/xe_drm_client: Add per drm client pagefault info
> > 
> > On 17/02/2025 17:01, Simona Vetter wrote:
> > > On Fri, Feb 14, 2025 at 08:37:56PM +0000, Jonathan Cavitt wrote:
> > >> [snip]
> > 
> > From one angle, one could say the new data is not drm fdinfo compliant,
> > so existing parsers will simply skip over it and do nothing, which is
> > not a problem per se.
> > 
> > So, without going into any analysis of what data is exposed here and how
> > useful it is, maybe drm fdinfo does not care.
> > 
> > Then from another angle, going slightly deeper into what is proposed
> > here, it _seems_ to be adding some sort of logging to fdinfo, which is
> > very different from the current single-counter paradigm. I wonder if
> > there is precedent anywhere in the kernel for stuffing logs into fdinfo?
> > 
> > My first reaction is that it feels like a questionable approach (how
> > will userspace correlate "Exec queue *N*" with anything?). I guess more
> 
> The user is given an exec queue ID number when they call the exec queue
> create ioctl, and that ID is reported back as "Exec queue *N*" here.
> 
> However, it was requested (by Joonas) that this be converted into an
> xe_query query, so this will no longer be reported through
> xe_drm_client_fdinfo.  Apologies for the confusion.

Yeah, this sounds very much like query ioctl material, or maybe tracepoint
material. I didn't even look into the semantics, just noticed the very
drm_fdinfo-unlike formatting.
-Sima

> -Jonathan Cavitt
> 
> > info on how this is useful for userspace should be added to the patch
> > description.
> > 
> > Regards,
> > 
> > Tvrtko
> > 
> > >> [snip]
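
To make the formatting concern concrete: fdinfo stats are parsed as one
"key: value" pair per line, so per-queue data would have to be folded into
single-line keys for existing parsers to skip or consume cleanly. A rough
sketch of that shape, assuming a hypothetical "xe-banned-queue-<id>" key
that is not defined anywhere today:

/* Sketch only: emit one parseable "key: value" line per banned queue,
 * following the drm-usage-stats one-pair-per-line convention.
 */
static void show_blames(struct drm_printer *p, struct xe_drm_client *client)
{
	struct blame *b;

	spin_lock(&client->blame_lock);
	list_for_each_entry(b, &client->blame_list, list) {
		if (b->pf)
			drm_printf(p, "xe-banned-queue-%u: asid=%d addr=0x%llx class=%d\n",
				   b->exec_queue_id, b->pf->asid,
				   (unsigned long long)b->pf->page_addr,
				   b->pf->engine_class);
		else
			drm_printf(p, "xe-banned-queue-%u: no-pagefault\n",
				   b->exec_queue_id);
	}
	spin_unlock(&client->blame_lock);
}

A query ioctl, as suggested above, avoids the text-format question entirely
and can return structured records instead.
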
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
index 2d4874d2b922..f15560d0b6ff 100644
--- a/drivers/gpu/drm/xe/xe_drm_client.c
+++ b/drivers/gpu/drm/xe/xe_drm_client.c
@@ -17,6 +17,7 @@ 
 #include "xe_exec_queue.h"
 #include "xe_force_wake.h"
 #include "xe_gt.h"
+#include "xe_gt_pagefault.h"
 #include "xe_hw_engine.h"
 #include "xe_pm.h"
 #include "xe_trace.h"
@@ -70,6 +71,21 @@ 
  * 	drm-total-cycles-ccs:   7655183225
  * 	drm-engine-capacity-ccs:        4
  *
+ * 	- Exec queue ban list -
+ *
+ * 		Exec queue 1 banned:
+ * 			Associated pagefault:
+ * 			ASID: 9
+ * 			VFID: 0
+ * 			PDATA: 0x0450
+ * 			Faulted Address: 0x000001fff86a9000
+ * 			FaultType: NOT_PRESENT
+ * 			AccessType: ACCESS_TYPE_WRITE
+ * 			FaultLevel: 0
+ * 			EngineClass: 1 vcs
+ * 			EngineInstance: 0
+ *		
+ *
  * Possible `drm-cycles-` key names are: `rcs`, `ccs`, `bcs`, `vcs`, `vecs` and
  * "other".
  */
@@ -97,6 +113,8 @@  struct xe_drm_client *xe_drm_client_alloc(void)
 #ifdef CONFIG_PROC_FS
 	spin_lock_init(&client->bos_lock);
 	INIT_LIST_HEAD(&client->bos_list);
+	spin_lock_init(&client->blame_lock);
+	INIT_LIST_HEAD(&client->blame_list);
 #endif
 	return client;
 }
@@ -164,6 +182,72 @@  void xe_drm_client_remove_bo(struct xe_bo *bo)
 	xe_drm_client_put(client);
 }
 
+static void free_blame(struct blame *b)
+{
+	list_del(&b->list);
+	kfree(b->pf);
+	kfree(b);
+}
+
+void xe_drm_client_add_blame(struct xe_drm_client *client,
+			     struct xe_exec_queue *q)
+{
+	struct blame *b = NULL;
+	struct list_head *h;
+	struct pagefault *pf = NULL;
+	struct xe_file *xef = q->xef;
+	struct xe_hw_engine *hwe = q->hwe;
+	unsigned long count;
+
+	b = kzalloc(sizeof(struct blame), GFP_KERNEL);
+	xe_assert(xef->xe, b);
+
+	spin_lock(&client->blame_lock);
+	list_add_tail(&b->list, &client->blame_list);
+	/**
+	 * Limit the number of blames in the blame list to prevent memory overuse.
+	 *
+	 * TODO: Parameterize max blame list size.
+	 */
+	count = 0;
+	list_for_each(h, &client->blame_list)
+		count++;
+	if (count >= 50) {
+		struct blame *rem = list_first_entry(&client->blame_list, struct blame, list);
+		free_blame(rem);
+	}
+	spin_unlock(&client->blame_lock);
+
+	/**
+	 * Duplicate pagefault on engine to blame, if one may have caused the
+	 * exec queue to be banned.
+	 */
+	b->pf = NULL;
+	spin_lock(&hwe->pf.lock);
+	if (hwe->pf.info) {
+		pf = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
+		memcpy(pf, hwe->pf.info, sizeof(struct pagefault));
+	}
+	spin_unlock(&hwe->pf.lock);
+
+	/** Save blame data to list element */
+	b->exec_queue_id = q->id;
+	b->pf = pf;
+}
+
+void xe_drm_client_remove_blame(struct xe_drm_client *client,
+				struct xe_exec_queue *q)
+{
+	struct blame *b, *tmp;
+
+	spin_lock(&client->blame_lock);
+	list_for_each_entry_safe(b, tmp, &client->blame_list, list)
+		if (b->exec_queue_id == q->id)
+			free_blame(b);
+	spin_unlock(&client->blame_lock);
+
+}
+
 static void bo_meminfo(struct xe_bo *bo,
 		       struct drm_memory_stats stats[TTM_NUM_MEM_TYPES])
 {
@@ -380,6 +464,49 @@  static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
 	}
 }
 
+static void print_pagefault(struct drm_printer *p, struct pagefault *pf)
+{
+	drm_printf(p, "\n\t\tASID: %d\n"
+	 "\t\tVFID: %d\n"
+	 "\t\tPDATA: 0x%04x\n"
+	 "\t\tFaulted Address: 0x%08x%08x\n"
+	 "\t\tFaultType: %s\n"
+	 "\t\tAccessType: %s\n"
+	 "\t\tFaultLevel: %d\n"
+	 "\t\tEngineClass: %d %s\n"
+	 "\t\tEngineInstance: %d\n",
+	 pf->asid, pf->vfid, pf->pdata, upper_32_bits(pf->page_addr),
+	 lower_32_bits(pf->page_addr),
+	 fault_type_to_str(pf->fault_type),
+	 access_type_to_str(pf->access_type),
+	 pf->fault_level, pf->engine_class,
+	 xe_hw_engine_class_to_str(pf->engine_class),
+	 pf->engine_instance);
+}
+
+static void show_blames(struct drm_printer *p, struct drm_file *file)
+{
+	struct xe_file *xef = file->driver_priv;
+	struct xe_drm_client *client;
+	struct blame *b;
+
+	client = xef->client;
+
+	drm_printf(p, "\n");
+	drm_printf(p, "- Exec queue ban list -\n");
+	spin_lock(&client->blame_lock);
+	list_for_each_entry(b, &client->blame_list, list) {
+		struct pagefault *pf = b->pf;
+		drm_printf(p, "\n\tExec queue %u banned:\n", b->exec_queue_id);
+		drm_printf(p, "\t\tAssociated pagefault:\n");
+		if (pf)
+			print_pagefault(p, pf);
+		else
+			drm_printf(p, "\t\t- No associated pagefault -\n");
+	}
+	spin_unlock(&client->blame_lock);
+}
+
 /**
  * xe_drm_client_fdinfo() - Callback for fdinfo interface
  * @p: The drm_printer ptr
@@ -394,5 +521,6 @@  void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file)
 {
 	show_meminfo(p, file);
 	show_run_ticks(p, file);
+	show_blames(p, file);
 }
 #endif
diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h
index a9649aa36011..d21fd0b90742 100644
--- a/drivers/gpu/drm/xe/xe_drm_client.h
+++ b/drivers/gpu/drm/xe/xe_drm_client.h
@@ -15,7 +15,18 @@ 
 
 struct drm_file;
 struct drm_printer;
+struct pagefault;
 struct xe_bo;
+struct xe_exec_queue;
+
+struct blame {
+	/** @exec_queue_id: ID number of banned exec queue */
+	u32 exec_queue_id;
+	/** @pf: pagefault on engine of banned exec queue, if any at time */
+	struct pagefault *pf;
+	/** @list: link into @xe_drm_client.blame_list */
+	struct list_head list;
+};
 
 struct xe_drm_client {
 	struct kref kref;
@@ -31,6 +42,17 @@  struct xe_drm_client {
 	 * Protected by @bos_lock.
 	 */
 	struct list_head bos_list;
+	/**
+	 * @blame_lock: lock protecting @blame_list
+	 */
+	spinlock_t blame_lock;
+	/**
+	 * @blame_list: list of banned exec queues associated with this drm
+	 *		client, as well as any pagefaults at time of ban.
+	 *
+	 * Protected by @blame_lock;
+	 */
+	struct list_head blame_list;
 #endif
 };
 
@@ -57,6 +79,10 @@  void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file);
 void xe_drm_client_add_bo(struct xe_drm_client *client,
 			  struct xe_bo *bo);
 void xe_drm_client_remove_bo(struct xe_bo *bo);
+void xe_drm_client_add_blame(struct xe_drm_client *client,
+			     struct xe_exec_queue *q);
+void xe_drm_client_remove_blame(struct xe_drm_client *client,
+				struct xe_exec_queue *q);
 #else
 static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
 					struct xe_bo *bo)
@@ -66,5 +92,15 @@  static inline void xe_drm_client_add_bo(struct xe_drm_client *client,
 static inline void xe_drm_client_remove_bo(struct xe_bo *bo)
 {
 }
+
+static inline void xe_drm_client_add_blame(struct xe_drm_client *client,
+					   struct xe_exec_queue *q)
+{
+}
+
+static inline void xe_drm_client_remove_blame(struct xe_drm_client *client,
+					      struct xe_exec_queue *q)
+{
+}
 #endif
 #endif
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index a02e62465e01..9c9bc617020c 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -13,6 +13,7 @@ 
 #include <uapi/drm/xe_drm.h>
 
 #include "xe_device.h"
+#include "xe_drm_client.h"
 #include "xe_gt.h"
 #include "xe_hw_engine_class_sysfs.h"
 #include "xe_hw_engine_group.h"
@@ -714,6 +715,12 @@  int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
 	q->id = id;
 	args->exec_queue_id = id;
 
+	/**
+	 * If an exec queue in the blame list shares the same exec queue
+	 * ID, remove it from the blame list to avoid confusion.
+	 */
+	xe_drm_client_remove_blame(q->xef->client, q);
+
 	return 0;
 
 kill_exec_queue:
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index fe18e3ec488a..a0e6f2281e37 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -330,6 +330,23 @@  int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	return full ? -ENOSPC : 0;
 }
 
+static void save_pagefault_to_engine(struct xe_gt *gt, struct pagefault *pf)
+{
+	struct xe_hw_engine *hwe;
+
+	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
+	if (hwe) {
+		spin_lock(&hwe->pf.lock);
+		/** The latest pagefault is pf, so remove old pf info from engine */
+		if (hwe->pf.info)
+			kfree(hwe->pf.info);
+		hwe->pf.info = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
+		if (hwe->pf.info)
+			memcpy(hwe->pf.info, pf, sizeof(struct pagefault));
+		spin_unlock(&hwe->pf.lock);
+	}
+}
+
 #define USM_QUEUE_MAX_RUNTIME_MS	20
 
 static void pf_queue_work_func(struct work_struct *w)
@@ -352,6 +369,8 @@  static void pf_queue_work_func(struct work_struct *w)
 			drm_dbg(&xe->drm, "Fault response: Unsuccessful %d\n", ret);
 		}
 
+		save_pagefault_to_engine(gt, &pf);
+
 		reply.dw0 = FIELD_PREP(PFR_VALID, 1) |
 			FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) |
 			FIELD_PREP(PFR_REPLY, PFR_ACCESS) |
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 913c74d6e2ae..d9da5c89429e 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -20,11 +20,13 @@ 
 #include "xe_assert.h"
 #include "xe_devcoredump.h"
 #include "xe_device.h"
+#include "xe_drm_client.h"
 #include "xe_exec_queue.h"
 #include "xe_force_wake.h"
 #include "xe_gpu_scheduler.h"
 #include "xe_gt.h"
 #include "xe_gt_clock.h"
+#include "xe_gt_pagefault.h"
 #include "xe_gt_printk.h"
 #include "xe_guc.h"
 #include "xe_guc_capture.h"
@@ -146,6 +148,7 @@  static bool exec_queue_banned(struct xe_exec_queue *q)
 static void set_exec_queue_banned(struct xe_exec_queue *q)
 {
 	atomic_or(EXEC_QUEUE_STATE_BANNED, &q->guc->state);
+	xe_drm_client_add_blame(q->xef->client, q);
 }
 
 static bool exec_queue_suspended(struct xe_exec_queue *q)
@@ -1971,6 +1974,7 @@  int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
 int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
 {
 	struct xe_gt *gt = guc_to_gt(guc);
+	struct xe_hw_engine *hwe;
 	struct xe_exec_queue *q;
 	u32 guc_id;
 
@@ -1983,11 +1987,24 @@  int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	if (unlikely(!q))
 		return -EPROTO;
 
+	hwe = q->hwe;
+
 	xe_gt_info(gt, "Engine reset: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
 		   xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
 
 	trace_xe_exec_queue_reset(q);
 
+	/**
+	 * Clear last pagefault from engine.  Any future exec queue bans likely were
+	 * not caused by said pagefault now that the engine has reset.
+	 */
+	spin_lock(&hwe->pf.lock);
+	if (hwe->pf.info) {
+		kfree(hwe->pf.info);
+		hwe->pf.info = kzalloc(sizeof(struct pagefault), GFP_KERNEL);
+	}
+	spin_unlock(&hwe->pf.lock);
+
 	/*
 	 * A banned engine is a NOP at this point (came from
 	 * guc_exec_queue_timedout_job). Otherwise, kick drm scheduler to cancel
diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c
index fc447751fe78..69f61e4905e2 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine.c
+++ b/drivers/gpu/drm/xe/xe_hw_engine.c
@@ -21,6 +21,7 @@ 
 #include "xe_gsc.h"
 #include "xe_gt.h"
 #include "xe_gt_ccs_mode.h"
+#include "xe_gt_pagefault.h"
 #include "xe_gt_printk.h"
 #include "xe_gt_mcr.h"
 #include "xe_gt_topology.h"
@@ -557,6 +558,9 @@  static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe,
 		hwe->eclass->defaults = hwe->eclass->sched_props;
 	}
 
+	hwe->pf.info = NULL;
+	spin_lock_init(&hwe->pf.lock);
+
 	xe_reg_sr_init(&hwe->reg_sr, hwe->name, gt_to_xe(gt));
 	xe_tuning_process_engine(hwe);
 	xe_wa_process_engine(hwe);
diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h
index e4191a7a2c31..2e1be9481d9b 100644
--- a/drivers/gpu/drm/xe/xe_hw_engine_types.h
+++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h
@@ -64,6 +64,7 @@  enum xe_hw_engine_id {
 struct xe_bo;
 struct xe_execlist_port;
 struct xe_gt;
+struct pagefault;
 
 /**
  * struct xe_hw_engine_class_intf - per hw engine class struct interface
@@ -150,6 +151,13 @@  struct xe_hw_engine {
 	struct xe_oa_unit *oa_unit;
 	/** @hw_engine_group: the group of hw engines this one belongs to */
 	struct xe_hw_engine_group *hw_engine_group;
+	/** @pf: the last pagefault seen on this engine */
+	struct {
+		/** @pf.info: info containing last seen pagefault details */
+		struct pagefault *info;
+		/** @pf.lock: lock protecting @pf.info */
+		spinlock_t lock;
+	} pf;
 };
 
 enum xe_hw_engine_snapshot_source_id {
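
Two loose ends worth noting against the diff above: the kernel test robot
flagged the NULL check before kfree() in save_pagefault_to_engine(), and the
GFP_KERNEL allocations there and in xe_guc_exec_queue_reset_handler() happen
while hwe->pf.lock (a spinlock) is held, where sleeping allocations are not
allowed. A possible reshape of the helper (a sketch, not part of this
series):

/* Sketch only: kfree(NULL) is a no-op, so the NULL check can go, and
 * kmemdup(..., GFP_ATOMIC) avoids sleeping while hwe->pf.lock is held.
 */
static void save_pagefault_to_engine(struct xe_gt *gt, struct pagefault *pf)
{
	struct xe_hw_engine *hwe;

	hwe = xe_gt_hw_engine(gt, pf->engine_class, pf->engine_instance, false);
	if (!hwe)
		return;

	spin_lock(&hwe->pf.lock);
	kfree(hwe->pf.info);
	hwe->pf.info = kmemdup(pf, sizeof(*pf), GFP_ATOMIC);
	spin_unlock(&hwe->pf.lock);
}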