Message ID | 20250403202705.18488-2-matthew.brost@intel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Large devcoredump file support | expand |
-----Original Message----- From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost Sent: Thursday, April 3, 2025 1:27 PM To: intel-xe@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org Subject: [PATCH v2 1/4] drm/xe: Add devcoredump chunking > > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read > callback as needed. > > Signed-off-by: Matthew Brost <matthew.brost@intel.com> I have no issues with this patch, though you should maybe get a second opinion. Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com> -Jonathan Cavitt > --- > drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++----- > drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 + > drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +- > 3 files changed, 50 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c > index 81b9d9bb3f57..a9e618abf8ac 100644 > --- a/drivers/gpu/drm/xe/xe_devcoredump.c > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) > return &q->gt->uc.guc; > } > > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count, > + ssize_t start, > struct xe_devcoredump *coredump) > { > struct xe_device *xe; > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > ss = &coredump->snapshot; > > iter.data = buffer; > - iter.start = 0; > + iter.start = start; > iter.remain = count; > > p = drm_coredump_printer(&iter); > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) > ss->vm = NULL; > } > > +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G) > + > static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > size_t count, void *data, size_t datalen) > { > @@ -183,6 +186,9 @@ static 
ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > /* Ensure delayed work is captured before continuing */ > flush_work(&ss->work); > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > + xe_pm_runtime_get(gt_to_xe(ss->gt)); > + > mutex_lock(&coredump->lock); > > if (!ss->read.buffer) { > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > return 0; > } > > + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX || > + offset < ss->read.chunk_position) { > + ss->read.chunk_position = > + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX); > + > + __xe_devcoredump_read(ss->read.buffer, > + XE_DEVCOREDUMP_CHUNK_MAX, > + ss->read.chunk_position, coredump); > + } > + > byte_copied = count < ss->read.size - offset ? count : > ss->read.size - offset; > - memcpy(buffer, ss->read.buffer + offset, byte_copied); > + memcpy(buffer, ss->read.buffer + > + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied); > > mutex_unlock(&coredump->lock); > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > + xe_pm_runtime_put(gt_to_xe(ss->gt)); > + > return byte_copied; > } > > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) > xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); > xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); > > - xe_pm_runtime_put(xe); > + ss->read.chunk_position = 0; > > /* Calculate devcoredump size */ > - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); > - > - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > - if (!ss->read.buffer) > - return; > + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump); > + > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) { > + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX, > + GFP_USER); > + if (!ss->read.buffer) > + goto put_pm; > + > + __xe_devcoredump_read(ss->read.buffer, > + XE_DEVCOREDUMP_CHUNK_MAX, > + 0, coredump); > + } else { > + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > + if 
(!ss->read.buffer) > + goto put_pm; > + > + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0, > + coredump); > + xe_devcoredump_snapshot_free(ss); > + } > > - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); > - xe_devcoredump_snapshot_free(ss); > +put_pm: > + xe_pm_runtime_put(xe); > } > > static void devcoredump_snapshot(struct xe_devcoredump *coredump, > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi > if (offset & 3) > drm_printf(p, "Offset not word aligned: %zu", offset); > > - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL); > + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC); > if (!line_buff) { > drm_printf(p, "Failed to allocate line buffer\n"); > return; > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h > index 1a1d16a96b2d..a174385a6d83 100644 > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot { > struct { > /** @read.size: size of devcoredump in human readable format */ > ssize_t size; > + /** @read.chunk_position: position of devcoredump chunk */ > + ssize_t chunk_position; > /** @read.buffer: buffer of devcoredump in human readable format */ > char *buffer; > } read; > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > index af2c817d552c..21403a250834 100644 > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val) > if (num_dw == 0) > return -EINVAL; > > - hwconfig = kzalloc(size, GFP_KERNEL); > + hwconfig = kzalloc(size, GFP_ATOMIC); > if (!hwconfig) > return -ENOMEM; > > -- > 2.34.1 > >
On 4/3/2025 1:27 PM, Matthew Brost wrote: > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read > callback as needed. > > Signed-off-by: Matthew Brost <matthew.brost@intel.com> > --- > drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++----- > drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 + > drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +- > 3 files changed, 50 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c > index 81b9d9bb3f57..a9e618abf8ac 100644 > --- a/drivers/gpu/drm/xe/xe_devcoredump.c > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) > return &q->gt->uc.guc; > } > > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count, > + ssize_t start, > struct xe_devcoredump *coredump) > { > struct xe_device *xe; > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > ss = &coredump->snapshot; > > iter.data = buffer; > - iter.start = 0; > + iter.start = start; > iter.remain = count; > > p = drm_coredump_printer(&iter); > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) > ss->vm = NULL; > } > > +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G) > + > static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > size_t count, void *data, size_t datalen) > { > @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > /* Ensure delayed work is captured before continuing */ > flush_work(&ss->work); > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > + xe_pm_runtime_get(gt_to_xe(ss->gt)); > + > mutex_lock(&coredump->lock); > > if (!ss->read.buffer) { > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > return 
0; > } > > + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX || > + offset < ss->read.chunk_position) { > + ss->read.chunk_position = > + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX); > + > + __xe_devcoredump_read(ss->read.buffer, > + XE_DEVCOREDUMP_CHUNK_MAX, > + ss->read.chunk_position, coredump); > + } > + > byte_copied = count < ss->read.size - offset ? count : > ss->read.size - offset; > - memcpy(buffer, ss->read.buffer + offset, byte_copied); > + memcpy(buffer, ss->read.buffer + > + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied); > > mutex_unlock(&coredump->lock); > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > + xe_pm_runtime_put(gt_to_xe(ss->gt)); > + > return byte_copied; > } > > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) > xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); > xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); > > - xe_pm_runtime_put(xe); > + ss->read.chunk_position = 0; > > /* Calculate devcoredump size */ > - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); > - > - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > - if (!ss->read.buffer) > - return; > + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump); > + > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) { > + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX, > + GFP_USER); > + if (!ss->read.buffer) > + goto put_pm; > + > + __xe_devcoredump_read(ss->read.buffer, > + XE_DEVCOREDUMP_CHUNK_MAX, > + 0, coredump); > + } else { > + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > + if (!ss->read.buffer) > + goto put_pm; > + > + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0, > + coredump); > + xe_devcoredump_snapshot_free(ss); > + } > > - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); > - xe_devcoredump_snapshot_free(ss); > +put_pm: > + xe_pm_runtime_put(xe); > } > > static void devcoredump_snapshot(struct xe_devcoredump *coredump, > @@ -425,7 +460,7 
@@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi > if (offset & 3) > drm_printf(p, "Offset not word aligned: %zu", offset); > > - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL); > + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC); Is this related? Or should it be a separate patch? > if (!line_buff) { > drm_printf(p, "Failed to allocate line buffer\n"); > return; > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h > index 1a1d16a96b2d..a174385a6d83 100644 > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot { > struct { > /** @read.size: size of devcoredump in human readable format */ > ssize_t size; > + /** @read.chunk_position: position of devcoredump chunk */ > + ssize_t chunk_position; > /** @read.buffer: buffer of devcoredump in human readable format */ > char *buffer; > } read; > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > index af2c817d552c..21403a250834 100644 > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val) > if (num_dw == 0) > return -EINVAL; > > - hwconfig = kzalloc(size, GFP_KERNEL); > + hwconfig = kzalloc(size, GFP_ATOMIC); Likewise this? John. > if (!hwconfig) > return -ENOMEM; >
On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote: > On 4/3/2025 1:27 PM, Matthew Brost wrote: > > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit > > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read > > callback as needed. > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com> > > --- > > drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++----- > > drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 + > > drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +- > > 3 files changed, 50 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c > > index 81b9d9bb3f57..a9e618abf8ac 100644 > > --- a/drivers/gpu/drm/xe/xe_devcoredump.c > > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c > > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) > > return &q->gt->uc.guc; > > } > > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count, > > + ssize_t start, > > struct xe_devcoredump *coredump) > > { > > struct xe_device *xe; > > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > > ss = &coredump->snapshot; > > iter.data = buffer; > > - iter.start = 0; > > + iter.start = start; > > iter.remain = count; > > p = drm_coredump_printer(&iter); > > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) > > ss->vm = NULL; > > } > > +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G) > > + > > static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > > size_t count, void *data, size_t datalen) > > { > > @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > > /* Ensure delayed work is captured before continuing */ > > flush_work(&ss->work); > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > > + xe_pm_runtime_get(gt_to_xe(ss->gt)); > > + > > 
mutex_lock(&coredump->lock); > > if (!ss->read.buffer) { > > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > > return 0; > > } > > + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX || > > + offset < ss->read.chunk_position) { > > + ss->read.chunk_position = > > + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX); > > + > > + __xe_devcoredump_read(ss->read.buffer, > > + XE_DEVCOREDUMP_CHUNK_MAX, > > + ss->read.chunk_position, coredump); > > + } > > + > > byte_copied = count < ss->read.size - offset ? count : > > ss->read.size - offset; > > - memcpy(buffer, ss->read.buffer + offset, byte_copied); > > + memcpy(buffer, ss->read.buffer + > > + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied); > > mutex_unlock(&coredump->lock); > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > > + xe_pm_runtime_put(gt_to_xe(ss->gt)); > > + > > return byte_copied; > > } > > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) > > xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); > > xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); > > - xe_pm_runtime_put(xe); > > + ss->read.chunk_position = 0; > > /* Calculate devcoredump size */ > > - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); > > - > > - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > > - if (!ss->read.buffer) > > - return; > > + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump); > > + > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) { > > + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX, > > + GFP_USER); > > + if (!ss->read.buffer) > > + goto put_pm; > > + > > + __xe_devcoredump_read(ss->read.buffer, > > + XE_DEVCOREDUMP_CHUNK_MAX, > > + 0, coredump); > > + } else { > > + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > > + if (!ss->read.buffer) > > + goto put_pm; > > + > > + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0, > > + coredump); > > + 
xe_devcoredump_snapshot_free(ss); > > + } > > - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); > > - xe_devcoredump_snapshot_free(ss); > > +put_pm: > > + xe_pm_runtime_put(xe); > > } > > static void devcoredump_snapshot(struct xe_devcoredump *coredump, > > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi > > if (offset & 3) > > drm_printf(p, "Offset not word aligned: %zu", offset); > > - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL); > > + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC); > Is this related? Or should it be a separate patch? > It is related. Now that __xe_devcoredump_read is called under 'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read calls xe_print_blob_ascii85. Both cases the allocation is relatively small, so would be fairly unlikely to fail. I could make this a seperate prep patch if you think that would be better. > > if (!line_buff) { > > drm_printf(p, "Failed to allocate line buffer\n"); > > return; > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h > > index 1a1d16a96b2d..a174385a6d83 100644 > > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h > > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h > > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot { > > struct { > > /** @read.size: size of devcoredump in human readable format */ > > ssize_t size; > > + /** @read.chunk_position: position of devcoredump chunk */ > > + ssize_t chunk_position; > > /** @read.buffer: buffer of devcoredump in human readable format */ > > char *buffer; > > } read; > > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > > index af2c817d552c..21403a250834 100644 > > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c > > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val) > > if (num_dw == 0) > > 
return -EINVAL; > > - hwconfig = kzalloc(size, GFP_KERNEL); > > + hwconfig = kzalloc(size, GFP_ATOMIC); > Likewise this? > Same as above. Matt > John. > > > if (!hwconfig) > > return -ENOMEM; >
On 4/3/2025 3:32 PM, Matthew Brost wrote: > On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote: >> On 4/3/2025 1:27 PM, Matthew Brost wrote: >>> Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit >>> of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read >>> callback as needed. >>> >>> Signed-off-by: Matthew Brost <matthew.brost@intel.com> >>> --- >>> drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++----- >>> drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 + >>> drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +- >>> 3 files changed, 50 insertions(+), 13 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c >>> index 81b9d9bb3f57..a9e618abf8ac 100644 >>> --- a/drivers/gpu/drm/xe/xe_devcoredump.c >>> +++ b/drivers/gpu/drm/xe/xe_devcoredump.c >>> @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) >>> return &q->gt->uc.guc; >>> } >>> -static ssize_t __xe_devcoredump_read(char *buffer, size_t count, >>> +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count, >>> + ssize_t start, >>> struct xe_devcoredump *coredump) >>> { >>> struct xe_device *xe; >>> @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, >>> ss = &coredump->snapshot; >>> iter.data = buffer; >>> - iter.start = 0; >>> + iter.start = start; >>> iter.remain = count; >>> p = drm_coredump_printer(&iter); >>> @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) >>> ss->vm = NULL; >>> } >>> +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G) >>> + >>> static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, >>> size_t count, void *data, size_t datalen) >>> { >>> @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, >>> /* Ensure delayed work is captured before continuing */ >>> flush_work(&ss->work); >>> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) >>> + 
xe_pm_runtime_get(gt_to_xe(ss->gt)); >>> + >>> mutex_lock(&coredump->lock); >>> if (!ss->read.buffer) { >>> @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, >>> return 0; >>> } >>> + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX || >>> + offset < ss->read.chunk_position) { >>> + ss->read.chunk_position = >>> + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX); >>> + >>> + __xe_devcoredump_read(ss->read.buffer, >>> + XE_DEVCOREDUMP_CHUNK_MAX, >>> + ss->read.chunk_position, coredump); >>> + } >>> + >>> byte_copied = count < ss->read.size - offset ? count : >>> ss->read.size - offset; >>> - memcpy(buffer, ss->read.buffer + offset, byte_copied); >>> + memcpy(buffer, ss->read.buffer + >>> + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied); >>> mutex_unlock(&coredump->lock); >>> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) >>> + xe_pm_runtime_put(gt_to_xe(ss->gt)); >>> + >>> return byte_copied; >>> } >>> @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) >>> xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); >>> xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); >>> - xe_pm_runtime_put(xe); >>> + ss->read.chunk_position = 0; >>> /* Calculate devcoredump size */ >>> - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); >>> - >>> - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); >>> - if (!ss->read.buffer) >>> - return; >>> + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump); >>> + >>> + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) { >>> + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX, >>> + GFP_USER); >>> + if (!ss->read.buffer) >>> + goto put_pm; >>> + >>> + __xe_devcoredump_read(ss->read.buffer, >>> + XE_DEVCOREDUMP_CHUNK_MAX, >>> + 0, coredump); >>> + } else { >>> + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); >>> + if (!ss->read.buffer) >>> + goto put_pm; >>> + >>> + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0, >>> + 
coredump); >>> + xe_devcoredump_snapshot_free(ss); >>> + } >>> - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); >>> - xe_devcoredump_snapshot_free(ss); >>> +put_pm: >>> + xe_pm_runtime_put(xe); >>> } >>> static void devcoredump_snapshot(struct xe_devcoredump *coredump, >>> @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi >>> if (offset & 3) >>> drm_printf(p, "Offset not word aligned: %zu", offset); >>> - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL); >>> + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC); >> Is this related? Or should it be a separate patch? >> > It is related. Now that __xe_devcoredump_read is called under > 'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read > calls xe_print_blob_ascii85. > > Both cases the allocation is relatively small, so would be fairly > unlikely to fail. > > I could make this a seperate prep patch if you think that would be > better. Not worth splitting if this is the requirement. But maybe just add a comment to the commit message about why it is necessary. 
> >>> if (!line_buff) { >>> drm_printf(p, "Failed to allocate line buffer\n"); >>> return; >>> diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h >>> index 1a1d16a96b2d..a174385a6d83 100644 >>> --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h >>> +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h >>> @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot { >>> struct { >>> /** @read.size: size of devcoredump in human readable format */ >>> ssize_t size; >>> + /** @read.chunk_position: position of devcoredump chunk */ >>> + ssize_t chunk_position; >>> /** @read.buffer: buffer of devcoredump in human readable format */ >>> char *buffer; >>> } read; >>> diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c >>> index af2c817d552c..21403a250834 100644 >>> --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c >>> +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c >>> @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val) >>> if (num_dw == 0) >>> return -EINVAL; >>> - hwconfig = kzalloc(size, GFP_KERNEL); >>> + hwconfig = kzalloc(size, GFP_ATOMIC); >> Likewise this? Hmm, hadn't realised we were doing an alloc/copy/free for every config table access. Is that just to avoid doing iosys type reads because the table is in LMEM? Seems a lot of overhead for each individual u32 read! But also why are we doing table reads during core dump -> text printing? AFAICT, the only reads are in the steering code for doing EU register reads. Don't all the reg reads happen up front and then we stop accessing the hardware while doing the chunked dump? If we are re-reading hardware state when doing the text conversion we can end up with inconsistent dumps as the state changes from one chunk to the next? John. >> > Same as above. > > Matt > >> John. >> >>> if (!hwconfig) >>> return -ENOMEM;
On Thu, Apr 03, 2025 at 03:41:16PM -0700, John Harrison wrote: > On 4/3/2025 3:32 PM, Matthew Brost wrote: > > On Thu, Apr 03, 2025 at 02:39:16PM -0700, John Harrison wrote: > > > On 4/3/2025 1:27 PM, Matthew Brost wrote: > > > > Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit > > > > of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read > > > > callback as needed. > > > > > > > > Signed-off-by: Matthew Brost <matthew.brost@intel.com> > > > > --- > > > > drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++----- > > > > drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 + > > > > drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +- > > > > 3 files changed, 50 insertions(+), 13 deletions(-) > > > > > > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c > > > > index 81b9d9bb3f57..a9e618abf8ac 100644 > > > > --- a/drivers/gpu/drm/xe/xe_devcoredump.c > > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump.c > > > > @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) > > > > return &q->gt->uc.guc; > > > > } > > > > -static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > > > > +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count, > > > > + ssize_t start, > > > > struct xe_devcoredump *coredump) > > > > { > > > > struct xe_device *xe; > > > > @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, > > > > ss = &coredump->snapshot; > > > > iter.data = buffer; > > > > - iter.start = 0; > > > > + iter.start = start; > > > > iter.remain = count; > > > > p = drm_coredump_printer(&iter); > > > > @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) > > > > ss->vm = NULL; > > > > } > > > > +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G) > > > > + > > > > static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > > > > size_t count, void *data, size_t datalen) > > > > { > > > > @@ 
-183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > > > > /* Ensure delayed work is captured before continuing */ > > > > flush_work(&ss->work); > > > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > > > > + xe_pm_runtime_get(gt_to_xe(ss->gt)); > > > > + > > > > mutex_lock(&coredump->lock); > > > > if (!ss->read.buffer) { > > > > @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, > > > > return 0; > > > > } > > > > + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX || > > > > + offset < ss->read.chunk_position) { > > > > + ss->read.chunk_position = > > > > + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX); > > > > + > > > > + __xe_devcoredump_read(ss->read.buffer, > > > > + XE_DEVCOREDUMP_CHUNK_MAX, > > > > + ss->read.chunk_position, coredump); > > > > + } > > > > + > > > > byte_copied = count < ss->read.size - offset ? count : > > > > ss->read.size - offset; > > > > - memcpy(buffer, ss->read.buffer + offset, byte_copied); > > > > + memcpy(buffer, ss->read.buffer + > > > > + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied); > > > > mutex_unlock(&coredump->lock); > > > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) > > > > + xe_pm_runtime_put(gt_to_xe(ss->gt)); > > > > + > > > > return byte_copied; > > > > } > > > > @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) > > > > xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); > > > > xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); > > > > - xe_pm_runtime_put(xe); > > > > + ss->read.chunk_position = 0; > > > > /* Calculate devcoredump size */ > > > > - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); > > > > - > > > > - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > > > > - if (!ss->read.buffer) > > > > - return; > > > > + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump); > > > > + > > > > + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) { > > > > + 
ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX, > > > > + GFP_USER); > > > > + if (!ss->read.buffer) > > > > + goto put_pm; > > > > + > > > > + __xe_devcoredump_read(ss->read.buffer, > > > > + XE_DEVCOREDUMP_CHUNK_MAX, > > > > + 0, coredump); > > > > + } else { > > > > + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); > > > > + if (!ss->read.buffer) > > > > + goto put_pm; > > > > + > > > > + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0, > > > > + coredump); > > > > + xe_devcoredump_snapshot_free(ss); > > > > + } > > > > - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); > > > > - xe_devcoredump_snapshot_free(ss); > > > > +put_pm: > > > > + xe_pm_runtime_put(xe); > > > > } > > > > static void devcoredump_snapshot(struct xe_devcoredump *coredump, > > > > @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi > > > > if (offset & 3) > > > > drm_printf(p, "Offset not word aligned: %zu", offset); > > > > - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL); > > > > + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC); > > > Is this related? Or should it be a separate patch? > > > > > It is related. Now that __xe_devcoredump_read is called under > > 'coredump->lock' we are in the path of reclaim, __xe_devcoredump_read > > calls xe_print_blob_ascii85. > > > > Both cases the allocation is relatively small, so would be fairly > > unlikely to fail. > > > > I could make this a seperate prep patch if you think that would be > > better. > Not worth splitting if this is the requirement. But maybe just add a comment > to the commit message about why it is necessary. > Sure. 
> > > > > > if (!line_buff) { > > > > drm_printf(p, "Failed to allocate line buffer\n"); > > > > return; > > > > diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h > > > > index 1a1d16a96b2d..a174385a6d83 100644 > > > > --- a/drivers/gpu/drm/xe/xe_devcoredump_types.h > > > > +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h > > > > @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot { > > > > struct { > > > > /** @read.size: size of devcoredump in human readable format */ > > > > ssize_t size; > > > > + /** @read.chunk_position: position of devcoredump chunk */ > > > > + ssize_t chunk_position; > > > > /** @read.buffer: buffer of devcoredump in human readable format */ > > > > char *buffer; > > > > } read; > > > > diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > > > > index af2c817d552c..21403a250834 100644 > > > > --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c > > > > +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c > > > > @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val) > > > > if (num_dw == 0) > > > > return -EINVAL; > > > > - hwconfig = kzalloc(size, GFP_KERNEL); > > > > + hwconfig = kzalloc(size, GFP_ATOMIC); > > > Likewise this? > Hmm, hadn't realised we were doing an alloc/copy/free for every config table > access. Is that just to avoid doing iosys type reads because the table is in > LMEM? Seems a lot of overhead for each individual u32 read! > > But also why are we doing table reads during core dump -> text printing? > AFAICT, the only reads are in the steering code for doing EU register reads. > Don't all the reg reads happen up front and then we stop accessing the > hardware while doing the chunked dump? If we are re-reading hardware state > when doing the text conversion we can end up with inconsistent dumps as the > state changes from one chunk to the next? 
> Honestly not familiar with this code at all, lockdep puked and just fixed it. :) On its surface, it does look like an unnecessary allocation. Matt > John. > > > > > > Same as above. > > Matt > > > > > John. > > > > > > > if (!hwconfig) > > > > return -ENOMEM; >
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index 81b9d9bb3f57..a9e618abf8ac 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -80,7 +80,8 @@ static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) return &q->gt->uc.guc; } -static ssize_t __xe_devcoredump_read(char *buffer, size_t count, +static ssize_t __xe_devcoredump_read(char *buffer, ssize_t count, + ssize_t start, struct xe_devcoredump *coredump) { struct xe_device *xe; @@ -94,7 +95,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, ss = &coredump->snapshot; iter.data = buffer; - iter.start = 0; + iter.start = start; iter.remain = count; p = drm_coredump_printer(&iter); @@ -168,6 +169,8 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss) ss->vm = NULL; } +#define XE_DEVCOREDUMP_CHUNK_MAX (SZ_512M + SZ_1G) + static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, size_t count, void *data, size_t datalen) { @@ -183,6 +186,9 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, /* Ensure delayed work is captured before continuing */ flush_work(&ss->work); + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) + xe_pm_runtime_get(gt_to_xe(ss->gt)); + mutex_lock(&coredump->lock); if (!ss->read.buffer) { @@ -195,12 +201,26 @@ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, return 0; } + if (offset >= ss->read.chunk_position + XE_DEVCOREDUMP_CHUNK_MAX || + offset < ss->read.chunk_position) { + ss->read.chunk_position = + ALIGN_DOWN(offset, XE_DEVCOREDUMP_CHUNK_MAX); + + __xe_devcoredump_read(ss->read.buffer, + XE_DEVCOREDUMP_CHUNK_MAX, + ss->read.chunk_position, coredump); + } + byte_copied = count < ss->read.size - offset ? 
count : ss->read.size - offset; - memcpy(buffer, ss->read.buffer + offset, byte_copied); + memcpy(buffer, ss->read.buffer + + (offset % XE_DEVCOREDUMP_CHUNK_MAX), byte_copied); mutex_unlock(&coredump->lock); + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) + xe_pm_runtime_put(gt_to_xe(ss->gt)); + return byte_copied; } @@ -254,17 +274,32 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); - xe_pm_runtime_put(xe); + ss->read.chunk_position = 0; /* Calculate devcoredump size */ - ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump); - - ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); - if (!ss->read.buffer) - return; + ss->read.size = __xe_devcoredump_read(NULL, LONG_MAX, 0, coredump); + + if (ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX) { + ss->read.buffer = kvmalloc(XE_DEVCOREDUMP_CHUNK_MAX, + GFP_USER); + if (!ss->read.buffer) + goto put_pm; + + __xe_devcoredump_read(ss->read.buffer, + XE_DEVCOREDUMP_CHUNK_MAX, + 0, coredump); + } else { + ss->read.buffer = kvmalloc(ss->read.size, GFP_USER); + if (!ss->read.buffer) + goto put_pm; + + __xe_devcoredump_read(ss->read.buffer, ss->read.size, 0, + coredump); + xe_devcoredump_snapshot_free(ss); + } - __xe_devcoredump_read(ss->read.buffer, ss->read.size, coredump); - xe_devcoredump_snapshot_free(ss); +put_pm: + xe_pm_runtime_put(xe); } static void devcoredump_snapshot(struct xe_devcoredump *coredump, @@ -425,7 +460,7 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffi if (offset & 3) drm_printf(p, "Offset not word aligned: %zu", offset); - line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_KERNEL); + line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC); if (!line_buff) { drm_printf(p, "Failed to allocate line buffer\n"); return; diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h index 1a1d16a96b2d..a174385a6d83 100644 --- 
a/drivers/gpu/drm/xe/xe_devcoredump_types.h +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h @@ -66,6 +66,8 @@ struct xe_devcoredump_snapshot { struct { /** @read.size: size of devcoredump in human readable format */ ssize_t size; + /** @read.chunk_position: position of devcoredump chunk */ + ssize_t chunk_position; /** @read.buffer: buffer of devcoredump in human readable format */ char *buffer; } read; diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c index af2c817d552c..21403a250834 100644 --- a/drivers/gpu/drm/xe/xe_guc_hwconfig.c +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c @@ -175,7 +175,7 @@ int xe_guc_hwconfig_lookup_u32(struct xe_guc *guc, u32 attribute, u32 *val) if (num_dw == 0) return -EINVAL; - hwconfig = kzalloc(size, GFP_KERNEL); + hwconfig = kzalloc(size, GFP_ATOMIC); if (!hwconfig) return -ENOMEM;
Chunk devcoredump into 1.5G pieces to avoid hitting the kvmalloc limit of 2G. Simple algorithm reads 1.5G at time in xe_devcoredump_read callback as needed. Signed-off-by: Matthew Brost <matthew.brost@intel.com> --- drivers/gpu/drm/xe/xe_devcoredump.c | 59 ++++++++++++++++++----- drivers/gpu/drm/xe/xe_devcoredump_types.h | 2 + drivers/gpu/drm/xe/xe_guc_hwconfig.c | 2 +- 3 files changed, 50 insertions(+), 13 deletions(-)