
[v4,3/3] drm/xe: Faster devcoredump

Message ID: 20240731213221.2523989-4-matthew.brost@intel.com (mailing list archive)
State: New, archived
Series: Faster devcoredump and fixes

Commit Message

Matthew Brost July 31, 2024, 9:32 p.m. UTC
The current algorithm to read out the devcoredump is O(N*N), where N is
the size of the coredump, due to the use of drm_coredump_printer in
xe_devcoredump_read: each read call re-prints the entire dump and skips
everything before the requested offset. Switch to an O(N) algorithm
which prints the devcoredump into a readable format once, in the
deferred snapshot work, and update xe_devcoredump_read to memcpy from
that readable format directly.

v2:
 - Fix double free on devcoredump removal (Testing)
 - Set read_data_size after snap work flush
 - Adjust remaining in iterator upon realloc (Testing)
 - Set read_data upon realloc (Testing)
v3:
 - Kernel doc

Reported-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2408
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_devcoredump.c       | 112 ++++++++++++++++------
 drivers/gpu/drm/xe/xe_devcoredump_types.h |   4 +
 2 files changed, 85 insertions(+), 31 deletions(-)
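
For illustration, here is a standalone userspace sketch of the two read
strategies (this is not the driver code; render_dump(), read_slow(),
read_fast() and DUMP_SIZE are made-up names standing in for the
drm_coredump_printer path and the precomputed-buffer path):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DUMP_SIZE 4096

/* Toy renderer: walks the whole dump on every call, emitting only the
 * [start, start + count) window into buf (buf == NULL just measures the
 * total size). Like the drm_coredump_printer path, it costs O(N) per
 * call regardless of how few bytes the caller asked for. */
static size_t render_dump(char *buf, size_t start, size_t count)
{
	size_t produced = 0, copied = 0;

	for (size_t i = 0; i < DUMP_SIZE; i++) {
		char c = 'a' + (i % 26);

		if (buf && produced >= start && copied < count)
			buf[copied++] = c;
		produced++;
	}

	return buf ? copied : produced;
}

/* Old scheme: every read re-renders from byte 0, so reading N bytes in
 * C-byte chunks costs O(N / C) calls x O(N) work each = O(N * N / C). */
static size_t read_slow(char *buffer, size_t offset, size_t count)
{
	return render_dump(buffer, offset, count);
}

/* New scheme: render once up front, then each read is a bounded memcpy. */
static char *read_data;
static size_t read_data_size;

static int capture_once(void)
{
	read_data_size = render_dump(NULL, 0, 0);	/* sizing pass */
	read_data = malloc(read_data_size);
	if (!read_data)
		return -1;
	render_dump(read_data, 0, read_data_size);	/* one O(N) render */
	return 0;
}

static size_t read_fast(char *buffer, size_t offset, size_t count)
{
	size_t n;

	if (offset >= read_data_size)
		return 0;
	n = count < read_data_size - offset ?
		count : read_data_size - offset;
	memcpy(buffer, read_data + offset, n);	/* O(count) per read */
	return n;
}

int main(void)
{
	char slow[64], fast[64];
	size_t total = 0, n;

	if (capture_once())
		return 1;

	/* Both paths return identical bytes; only the cost differs. */
	read_slow(slow, 128, sizeof(slow));
	read_fast(fast, 128, sizeof(fast));
	printf("chunks match: %d\n", !memcmp(slow, fast, sizeof(slow)));

	while ((n = read_fast(fast, total, sizeof(fast))) > 0)
		total += n;
	printf("read %zu of %zu bytes\n", total, read_data_size);

	free(read_data);
	return 0;
}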

Comments

Cavitt, Jonathan July 31, 2024, 10:31 p.m. UTC | #1
-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Wednesday, July 31, 2024 2:32 PM
To: intel-xe@lists.freedesktop.org; dri-devel@lists.freedesktop.org
Cc: maarten.lankhorst@linux.intel.com; Vivi, Rodrigo <rodrigo.vivi@intel.com>
Subject: [PATCH v4 3/3] drm/xe: Faster devcoredump
> 
> [snip]
> 
> @@ -46,6 +46,10 @@ struct xe_devcoredump_snapshot {
>  	struct xe_sched_job_snapshot *job;
>  	/** @vm: Snapshot of VM state */
>  	struct xe_vm_snapshot *vm;
> +	/** @read_data_size: size of read data */
> +	ssize_t read_data_size;
> +	/** @read_data: Read data */
> +	void *read_data;

Perhaps the data size and read data can be stapled together under the same
struct?

struct {
	ssize_t size;
	void *data;
} read_data;

It's not that big of a deal if this is untouched, just something to consider.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
-Jonathan Cavitt

>  };
>  
>  /**
> -- 
> 2.34.1
> 
>
Matthew Brost Aug. 1, 2024, midnight UTC | #2
On Wed, Jul 31, 2024 at 04:31:01PM -0600, Cavitt, Jonathan wrote:
> [snip]
> > @@ -46,6 +46,10 @@ struct xe_devcoredump_snapshot {
> >  	struct xe_sched_job_snapshot *job;
> >  	/** @vm: Snapshot of VM state */
> >  	struct xe_vm_snapshot *vm;
> > +	/** @read_data_size: size of read data */
> > +	ssize_t read_data_size;
> > +	/** @read_data: Read data */
> > +	void *read_data;
> 
> Perhaps the data size and read data can be stapled together under the same
> struct?
> 
> struct {
> 	ssize_t size;
> 	void *data;
> } read_data;
> 

Yes, I like this better. Will change.

Matt

> It's not that big of a deal if this is untouched, just something to consider.
> Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> -Jonathan Cavitt
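
For reference, a minimal sketch of the grouping agreed on above
(hypothetical: the names are illustrative, not taken from a posted
revision, and the surrounding snapshot fields are elided):

/* Hypothetical sketch only: groups the two fields as Jonathan suggests. */
#include <sys/types.h>	/* ssize_t */

struct xe_devcoredump_snapshot_sketch {
	/* ... existing snapshot fields elided ... */

	/** @read: formatted devcoredump contents, kept next to its size */
	struct {
		/** @read.size: bytes of formatted dump in @read.data */
		ssize_t size;
		/** @read.data: formatted dump, memcpy'd out on read */
		void *data;
	} read;
};

/* Accesses in xe_devcoredump.c would then become ss->read.size and
 * ss->read.data in place of ss->read_data_size and ss->read_data. */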

Patch

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index d8d8ca2c19d3..b802a35c22f2 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -66,22 +66,9 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q)
 	return &q->gt->uc.guc;
 }
 
-static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
-{
-	struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work);
-
-	/* keep going if fw fails as we still want to save the memory and SW data */
-	if (xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL))
-		xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
-	xe_vm_snapshot_capture_delayed(ss->vm);
-	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
-	xe_force_wake_put(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
-}
-
-static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
-				   size_t count, void *data, size_t datalen)
+static ssize_t __xe_devcoredump_read(char *buffer, size_t count,
+				     struct xe_devcoredump *coredump)
 {
-	struct xe_devcoredump *coredump = data;
 	struct xe_device *xe;
 	struct xe_devcoredump_snapshot *ss;
 	struct drm_printer p;
@@ -89,18 +76,12 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	struct timespec64 ts;
 	int i;
 
-	if (!coredump)
-		return -ENODEV;
-
 	xe = coredump_to_xe(coredump);
 	ss = &coredump->snapshot;
 
-	/* Ensure delayed work is captured before continuing */
-	flush_work(&ss->work);
-
 	iter.data = buffer;
 	iter.offset = 0;
-	iter.start = offset;
+	iter.start = 0;
 	iter.remain = count;
 
 	p = drm_coredump_printer(&iter);
@@ -131,13 +112,86 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
 	drm_printf(&p, "\n**** VM state ****\n");
 	xe_vm_snapshot_print(coredump->snapshot.vm, &p);
 
-	return count - iter.remain;
+	return iter.offset;
+}
+
+static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
+{
+	int i;
+
+	xe_guc_ct_snapshot_free(ss->ct);
+	ss->ct = NULL;
+
+	xe_guc_exec_queue_snapshot_free(ss->ge);
+	ss->ge = NULL;
+
+	xe_sched_job_snapshot_free(ss->job);
+	ss->job = NULL;
+
+	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
+		if (ss->hwe[i]) {
+			xe_hw_engine_snapshot_free(ss->hwe[i]);
+			ss->hwe[i] = NULL;
+		}
+
+	xe_vm_snapshot_free(ss->vm);
+	ss->vm = NULL;
+}
+
+static void xe_devcoredump_deferred_snap_work(struct work_struct *work)
+{
+	struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work);
+	struct xe_devcoredump *coredump = container_of(ss, typeof(*coredump), snapshot);
+
+	/* keep going if fw fails as we still want to save the memory and SW data */
+	if (xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL))
+		xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
+	xe_vm_snapshot_capture_delayed(ss->vm);
+	xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
+	xe_force_wake_put(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL);
+
+	/* Calculate devcoredump size */
+	ss->read_data_size = __xe_devcoredump_read(NULL, INT_MAX, coredump);
+
+	ss->read_data = kvmalloc(ss->read_data_size, GFP_USER);
+	if (!ss->read_data)
+		return;
+
+	__xe_devcoredump_read(ss->read_data, ss->read_data_size, coredump);
+	xe_devcoredump_snapshot_free(ss);
+}
+
+static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
+				   size_t count, void *data, size_t datalen)
+{
+	struct xe_devcoredump *coredump = data;
+	struct xe_devcoredump_snapshot *ss;
+	ssize_t byte_copied;
+
+	if (!coredump)
+		return -ENODEV;
+
+	ss = &coredump->snapshot;
+
+	/* Ensure delayed work is captured before continuing */
+	flush_work(&ss->work);
+
+	if (!ss->read_data)
+		return -ENODEV;
+
+	if (offset >= ss->read_data_size)
+		return 0;
+
+	byte_copied = count < ss->read_data_size - offset ? count :
+		ss->read_data_size - offset;
+	memcpy(buffer, ss->read_data + offset, byte_copied);
+
+	return byte_copied;
 }
 
 static void xe_devcoredump_free(void *data)
 {
 	struct xe_devcoredump *coredump = data;
-	int i;
 
 	/* Our device is gone. Nothing to do... */
 	if (!data || !coredump_to_xe(coredump))
@@ -145,13 +199,8 @@ static void xe_devcoredump_free(void *data)
 
 	cancel_work_sync(&coredump->snapshot.work);
 
-	xe_guc_ct_snapshot_free(coredump->snapshot.ct);
-	xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge);
-	xe_sched_job_snapshot_free(coredump->snapshot.job);
-	for (i = 0; i < XE_NUM_HW_ENGINES; i++)
-		if (coredump->snapshot.hwe[i])
-			xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]);
-	xe_vm_snapshot_free(coredump->snapshot.vm);
+	xe_devcoredump_snapshot_free(&coredump->snapshot);
+	kvfree(coredump->snapshot.read_data);
 
 	/* To prevent stale data on next snapshot, clear everything */
 	memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
@@ -260,4 +309,5 @@ int xe_devcoredump_init(struct xe_device *xe)
 {
 	return devm_add_action_or_reset(xe->drm.dev, xe_driver_devcoredump_fini, &xe->drm);
 }
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h
index 923cdf72a816..0298037edae4 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump_types.h
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -46,6 +46,10 @@ struct xe_devcoredump_snapshot {
 	struct xe_sched_job_snapshot *job;
 	/** @vm: Snapshot of VM state */
 	struct xe_vm_snapshot *vm;
+	/** @read_data_size: size of read data */
+	ssize_t read_data_size;
+	/** @read_data: Read data */
+	void *read_data;
 };
 
 /**