diff mbox series

[v2,2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access

Message ID 20250403202705.18488-3-matthew.brost@intel.com (mailing list archive)
State New
Headers show
Series Large devcoredump file support | expand

Commit Message

Matthew Brost April 3, 2025, 8:27 p.m. UTC
Add migrate layer functions to access VRAM and update
xe_ttm_access_memory to use it for non-visible access and large (more
than 16k) BO access. An 8G devcoredump on BMG observed 3 minute CPU copy
time vs. 3s GPU copy time.

v4:
 - Fix non-page aligned accesses
 - Add support for small / unaligned access
 - Update commit message indicating migrate used for large accesses (Auld)
 - Fix warning in xe_res_cursor for non-zero offset
v5:
 - Fix 32 bit build (CI)
v6:
 - Rebase and use SVM migration copy functions
v7:
 - Fix build error (CI)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/xe/xe_bo.c      |  15 ++-
 drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_migrate.h |   4 +
 3 files changed, 223 insertions(+), 17 deletions(-)

Comments

Cavitt, Jonathan April 3, 2025, 9:28 p.m. UTC | #1
-----Original Message-----
From: Intel-xe <intel-xe-bounces@lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Thursday, April 3, 2025 1:27 PM
To: intel-xe@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Subject: [PATCH v2 2/4] drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access
> 
> Add migrate layer functions to access VRAM and update
> xe_ttm_access_memory to use for non-visible access and large (more than
> 16k) BO access. 8G devcoredump on BMG observed 3 minute CPU copy time vs.
> 3s GPU copy time.
> 
> v4:
>  - Fix non-page aligned accesses
>  - Add support for small / unaligned access
>  - Update commit message indicating migrate used for large accesses (Auld)
>  - Fix warning in xe_res_cursor for non-zero offset
> v5:
>  - Fix 32 bit build (CI)
> v6:
>  - Rebase and use SVM migration copy functions
> v7:
>  - Fix build error (CI)
> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>

I left some notes down below.  Most are non-blocking, which I labeled as such.  And
as for the rest, it's probable that I'm just misunderstanding some parts of the code, and
that the notes I left are not relevant.

So, I'll be providing my reviewed-by now in the case that the blocking review notes
turn out to not need to be addressed.  And in the case they need to be addressed,
you can take my reviewed-by for the next revision.

Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>

> ---
>  drivers/gpu/drm/xe/xe_bo.c      |  15 ++-
>  drivers/gpu/drm/xe/xe_migrate.c | 221 ++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/xe/xe_migrate.h |   4 +
>  3 files changed, 223 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 3c7c2353d3c8..c7e6b03d4aef 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -1414,6 +1414,7 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
>  	struct xe_res_cursor cursor;
>  	struct xe_vram_region *vram;
>  	int bytes_left = len;
> +	int err = 0;
>  
>  	xe_bo_assert_held(bo);
>  	xe_device_assert_mem_access(xe);
> @@ -1421,9 +1422,14 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
>  	if (!mem_type_is_vram(ttm_bo->resource->mem_type))
>  		return -EIO;
>  
> -	/* FIXME: Use GPU for non-visible VRAM */
> -	if (!xe_ttm_resource_visible(ttm_bo->resource))
> -		return -EIO;
> +	if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
> +		struct xe_migrate *migrate =
> +			mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
> +
> +		err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
> +					       write);
> +		goto out;
> +	}
>  
>  	vram = res_to_mem_region(ttm_bo->resource);
>  	xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
> @@ -1447,7 +1453,8 @@ static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
>  			xe_res_next(&cursor, PAGE_SIZE);
>  	} while (bytes_left);
>  
> -	return len;
> +out:
> +	return err ?: len;
>  }
>  
>  const struct ttm_device_funcs xe_ttm_funcs = {
> diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
> index ff0fc2fb0eb9..ba8568691d99 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.c
> +++ b/drivers/gpu/drm/xe/xe_migrate.c
> @@ -670,6 +670,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
>  	u32 mocs = 0;
>  	u32 tile_y = 0;
>  
> +	xe_gt_assert(gt, !(pitch & 3));
>  	xe_gt_assert(gt, size / pitch <= S16_MAX);
>  	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
>  	xe_gt_assert(gt, pitch <= U16_MAX);
> @@ -1602,8 +1603,12 @@ enum xe_migrate_copy_dir {
>  	XE_MIGRATE_COPY_TO_SRAM,
>  };
>  
> +#define XE_CACHELINE_BYTES	64ull
> +#define XE_CACHELINE_MASK	(XE_CACHELINE_BYTES - 1)
> +
>  static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
> -					 unsigned long npages,
> +					 unsigned long len,
> +					 unsigned long sram_offset,
>  					 dma_addr_t *sram_addr, u64 vram_addr,
>  					 const enum xe_migrate_copy_dir dir)
>  {
> @@ -1613,17 +1618,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
>  	struct dma_fence *fence = NULL;
>  	u32 batch_size = 2;
>  	u64 src_L0_ofs, dst_L0_ofs;
> -	u64 round_update_size;
>  	struct xe_sched_job *job;
>  	struct xe_bb *bb;
>  	u32 update_idx, pt_slot = 0;
> +	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
> +	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
> +		PAGE_SIZE : 4;
>  	int err;
>  
> -	if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
> -		return ERR_PTR(-EINVAL);
> +	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
> +			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
> +		return ERR_PTR(-EOPNOTSUPP);
>  
> -	round_update_size = npages * PAGE_SIZE;
> -	batch_size += pte_update_cmd_size(round_update_size);
> +	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);

Hmm... Does this case still need to return an error value?  I don't think replacing it with an
xe_assert is entirely valid, as asserts should not be used as a replacement of proper error
handling.  Or so I've been told.

So, I guess the question is: what error was this preventing previously, and is it still a concern?

> +
> +	batch_size += pte_update_cmd_size(len);
>  	batch_size += EMIT_COPY_DW;
>  
>  	bb = xe_bb_new(gt, batch_size, use_usm_batch);
> @@ -1633,22 +1642,21 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
>  	}
>  
>  	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
> -				   sram_addr, round_update_size);
> +				   sram_addr, len + sram_offset);
>  
>  	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
> -		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> +		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
>  		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
>  
>  	} else {
>  		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
> -		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
> +		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
>  	}
>  
>  	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>  	update_idx = bb->len;
>  
> -	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
> -		  XE_PAGE_SIZE);
> +	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
>  
>  	job = xe_bb_create_migration_job(m->q, bb,
>  					 xe_migrate_batch_base(m, use_usm_batch),
> @@ -1696,7 +1704,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
>  				     dma_addr_t *src_addr,
>  				     u64 dst_addr)
>  {
> -	return xe_migrate_vram(m, npages, src_addr, dst_addr,
> +	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
>  			       XE_MIGRATE_COPY_TO_VRAM);
>  }
>  
> @@ -1717,12 +1725,199 @@ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
>  				       u64 src_addr,
>  				       dma_addr_t *dst_addr)
>  {
> -	return xe_migrate_vram(m, npages, dst_addr, src_addr,
> +	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
>  			       XE_MIGRATE_COPY_TO_SRAM);
>  }
>  
>  #endif
>  
> +static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
> +				 int len, int write)
> +{
> +	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (!dma_addr[i])
> +			continue;

Non-blocking:
I'm guessing the dma_addr array can be non-contiguous (i.e., position 3 having a
dma address does not imply positions 1 and 2 have them).  Because otherwise
this can be a break condition instead of a continue.

> +
> +		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
> +			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
> +	}
> +	kfree(dma_addr);
> +}
> +
> +static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
> +				      void *buf, int len, int write)
> +{
> +	dma_addr_t *dma_addr;
> +	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
> +
> +	dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
> +	if (!dma_addr)
> +		return ERR_PTR(-ENOMEM);
> +
> +	for (i = 0; i < npages; ++i) {
> +		dma_addr_t addr;
> +		struct page *page;
> +
> +		if (is_vmalloc_addr(buf))
> +			page = vmalloc_to_page(buf);
> +		else
> +			page = virt_to_page(buf);
> +
> +		addr = dma_map_page(xe->drm.dev,
> +				    page, 0, PAGE_SIZE,
> +				    write ? DMA_TO_DEVICE :
> +				    DMA_FROM_DEVICE);
> +		if (dma_mapping_error(xe->drm.dev, addr))
> +			goto err_fault;
> +
> +		dma_addr[i] = addr;
> +		buf += PAGE_SIZE;
> +	}
> +
> +	return dma_addr;
> +
> +err_fault:
> +	xe_migrate_dma_unmap(xe, dma_addr, len, write);
> +	return ERR_PTR(-EFAULT);
> +}
> +
> +/**
> + * xe_migrate_access_memory - Access memory of a BO via GPU
> + *
> + * @m: The migration context.
> + * @bo: buffer object
> + * @offset: access offset into buffer object
> + * @buf: pointer to caller memory to read into or write from
> + * @len: length of access
> + * @write: write access
> + *
> + * Access memory of a BO via GPU either reading in or writing from a passed in
> + * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
> + * read to or write from pointer.
> + *
> + * Returns:
> + * 0 if successful, negative error code on failure.
> + */
> +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> +			     unsigned long offset, void *buf, int len,
> +			     int write)
> +{
> +	struct xe_tile *tile = m->tile;
> +	struct xe_device *xe = tile_to_xe(tile);
> +	struct xe_res_cursor cursor;
> +	struct dma_fence *fence = NULL;
> +	dma_addr_t *dma_addr;
> +	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
> +	int bytes_left = len, current_page = 0;
> +	void *orig_buf = buf;
> +
> +	xe_bo_assert_held(bo);
> +
> +	/* Use bounce buffer for small access and unaligned access */
> +	if (len & XE_CACHELINE_MASK ||
> +	    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
> +		int buf_offset = 0;
> +
> +		/*
> +		 * Less than ideal for large unaligned access but this should be
> +		 * fairly rare, can fixup if this becomes common.
> +		 */
> +		do {
> +			u8 bounce[XE_CACHELINE_BYTES];
> +			void *ptr = (void *)bounce;
> +			int err;
> +			int copy_bytes = min_t(int, bytes_left,
> +					       XE_CACHELINE_BYTES -
> +					       (offset & XE_CACHELINE_MASK));
> +			int ptr_offset = offset & XE_CACHELINE_MASK;
> +
> +			err = xe_migrate_access_memory(m, bo,
> +						       offset &
> +						       ~XE_CACHELINE_MASK,
> +						       (void *)ptr,
> +						       sizeof(bounce), 0);
> +			if (err)
> +				return err;
> +
> +			if (!write) {
> +				memcpy(buf + buf_offset, ptr + ptr_offset,
> +				       copy_bytes);
> +				goto next;
> +			}

Non-blocking:
This should debatably be an if-else block, rather than relying on a "goto next"
here, as "goto next" is not used elsewhere in the function.

"""
	if (write) {
		memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
		err = xe_migrate_access_memory(m, bo,
						    offset & ~XE_CACHELINE_MASK,
						    (void *)ptr,
						    sizeof(bounce), 0);
	} else {
		memcpy(buf + buf_offset, ptr + ptr_offset,
			  copy_bytes);
	}

	if (err)
		return err;
"""

> +
> +			memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
> +			err = xe_migrate_access_memory(m, bo,
> +						       offset & ~XE_CACHELINE_MASK,
> +						       (void *)ptr,
> +						       sizeof(bounce), 0);
> +			if (err)
> +				return err;
> +
> +next:
> +			bytes_left -= copy_bytes;
> +			buf_offset += copy_bytes;
> +			offset += copy_bytes;
> +		} while (bytes_left);
> +
> +		return 0;
> +	}
> +
> +	dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
> +	if (IS_ERR(dma_addr))
> +		return PTR_ERR(dma_addr);
> +
> +	xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
> +
> +	do {
> +		struct dma_fence *__fence;
> +		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
> +			cursor.start;
> +		int current_bytes;
> +
> +		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
> +			current_bytes = min_t(int, bytes_left,
> +					      MAX_PREEMPTDISABLE_TRANSFER);
> +		else
> +			current_bytes = min_t(int, bytes_left, cursor.size);
> +
> +		if (fence)
> +			dma_fence_put(fence);
> +
> +		__fence = xe_migrate_vram(m, current_bytes,
> +					  (unsigned long)buf & ~PAGE_MASK,
> +					  dma_addr + current_page,
> +					  vram_addr, write ?
> +					  XE_MIGRATE_COPY_TO_VRAM :
> +					  XE_MIGRATE_COPY_TO_SRAM);
> +		if (IS_ERR(__fence)) {
> +			if (fence)
> +				dma_fence_wait(fence, false);
> +			fence = __fence;
> +			goto out_err;
> +		}
> +		fence = __fence;

Non-blocking:
It would be nice if we didn't have two paths assigning fence the __fence value,
but every attempt I make to change that ends in having to compute the IS_ERR
value for __fence twice.  E.G.:

"""
	if (IS_ERR(__fence) && fence)
		dma_fence_wait(fence, false);
	fence = __fence;
	if (IS_ERR(fence))
		goto out_err;
"""

So, either way works.

> +
> +		buf += current_bytes;
> +		offset += current_bytes;
> +		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
> +		bytes_left -= current_bytes;
> +		if (bytes_left)
> +			xe_res_next(&cursor, current_bytes);
> +	} while (bytes_left);
> +
> +	dma_fence_wait(fence, false);
> +	dma_fence_put(fence);
> +	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> +
> +	return 0;
> +
> +out_err:
> +	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
> +	return PTR_ERR(fence);

Non-blocking:
The branching execution might be able to be shortened.  Perhaps:
"""
	dma_fence_wait(fence, false);
	dma_fence_put(fence);

out_err:
	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
	return IS_ERR(fence) ? PTR_ERR(fence) : 0;
}
"""
-Jonathan Cavitt

> +}
> +
>  #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
>  #include "tests/xe_migrate.c"
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
> index 6ff9a963425c..fb9839c1bae0 100644
> --- a/drivers/gpu/drm/xe/xe_migrate.h
> +++ b/drivers/gpu/drm/xe/xe_migrate.h
> @@ -112,6 +112,10 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
>  				  struct ttm_resource *dst,
>  				  bool copy_only_ccs);
>  
> +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
> +			     unsigned long offset, void *buf, int len,
> +			     int write);
> +
>  #define XE_MIGRATE_CLEAR_FLAG_BO_DATA		BIT(0)
>  #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA		BIT(1)
>  #define XE_MIGRATE_CLEAR_FLAG_FULL	(XE_MIGRATE_CLEAR_FLAG_BO_DATA | \
> -- 
> 2.34.1
> 
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 3c7c2353d3c8..c7e6b03d4aef 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -1414,6 +1414,7 @@  static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 	struct xe_res_cursor cursor;
 	struct xe_vram_region *vram;
 	int bytes_left = len;
+	int err = 0;
 
 	xe_bo_assert_held(bo);
 	xe_device_assert_mem_access(xe);
@@ -1421,9 +1422,14 @@  static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 	if (!mem_type_is_vram(ttm_bo->resource->mem_type))
 		return -EIO;
 
-	/* FIXME: Use GPU for non-visible VRAM */
-	if (!xe_ttm_resource_visible(ttm_bo->resource))
-		return -EIO;
+	if (!xe_ttm_resource_visible(ttm_bo->resource) || len >= SZ_16K) {
+		struct xe_migrate *migrate =
+			mem_type_to_migrate(xe, ttm_bo->resource->mem_type);
+
+		err = xe_migrate_access_memory(migrate, bo, offset, buf, len,
+					       write);
+		goto out;
+	}
 
 	vram = res_to_mem_region(ttm_bo->resource);
 	xe_res_first(ttm_bo->resource, offset & PAGE_MASK,
@@ -1447,7 +1453,8 @@  static int xe_ttm_access_memory(struct ttm_buffer_object *ttm_bo,
 			xe_res_next(&cursor, PAGE_SIZE);
 	} while (bytes_left);
 
-	return len;
+out:
+	return err ?: len;
 }
 
 const struct ttm_device_funcs xe_ttm_funcs = {
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index ff0fc2fb0eb9..ba8568691d99 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -670,6 +670,7 @@  static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
 	u32 mocs = 0;
 	u32 tile_y = 0;
 
+	xe_gt_assert(gt, !(pitch & 3));
 	xe_gt_assert(gt, size / pitch <= S16_MAX);
 	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
 	xe_gt_assert(gt, pitch <= U16_MAX);
@@ -1602,8 +1603,12 @@  enum xe_migrate_copy_dir {
 	XE_MIGRATE_COPY_TO_SRAM,
 };
 
+#define XE_CACHELINE_BYTES	64ull
+#define XE_CACHELINE_MASK	(XE_CACHELINE_BYTES - 1)
+
 static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
-					 unsigned long npages,
+					 unsigned long len,
+					 unsigned long sram_offset,
 					 dma_addr_t *sram_addr, u64 vram_addr,
 					 const enum xe_migrate_copy_dir dir)
 {
@@ -1613,17 +1618,21 @@  static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
 	struct dma_fence *fence = NULL;
 	u32 batch_size = 2;
 	u64 src_L0_ofs, dst_L0_ofs;
-	u64 round_update_size;
 	struct xe_sched_job *job;
 	struct xe_bb *bb;
 	u32 update_idx, pt_slot = 0;
+	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
+	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
+		PAGE_SIZE : 4;
 	int err;
 
-	if (npages * PAGE_SIZE > MAX_PREEMPTDISABLE_TRANSFER)
-		return ERR_PTR(-EINVAL);
+	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
+			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
+		return ERR_PTR(-EOPNOTSUPP);
 
-	round_update_size = npages * PAGE_SIZE;
-	batch_size += pte_update_cmd_size(round_update_size);
+	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
+
+	batch_size += pte_update_cmd_size(len);
 	batch_size += EMIT_COPY_DW;
 
 	bb = xe_bb_new(gt, batch_size, use_usm_batch);
@@ -1633,22 +1642,21 @@  static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
 	}
 
 	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
-				   sram_addr, round_update_size);
+				   sram_addr, len + sram_offset);
 
 	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
-		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
 		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
 
 	} else {
 		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
-		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0);
+		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
 	}
 
 	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 	update_idx = bb->len;
 
-	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, round_update_size,
-		  XE_PAGE_SIZE);
+	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
 
 	job = xe_bb_create_migration_job(m->q, bb,
 					 xe_migrate_batch_base(m, use_usm_batch),
@@ -1696,7 +1704,7 @@  struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
 				     dma_addr_t *src_addr,
 				     u64 dst_addr)
 {
-	return xe_migrate_vram(m, npages, src_addr, dst_addr,
+	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
 			       XE_MIGRATE_COPY_TO_VRAM);
 }
 
@@ -1717,12 +1725,199 @@  struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
 				       u64 src_addr,
 				       dma_addr_t *dst_addr)
 {
-	return xe_migrate_vram(m, npages, dst_addr, src_addr,
+	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
 			       XE_MIGRATE_COPY_TO_SRAM);
 }
 
 #endif
 
+static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
+				 int len, int write)
+{
+	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+	for (i = 0; i < npages; ++i) {
+		if (!dma_addr[i])
+			continue;
+
+		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
+			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	}
+	kfree(dma_addr);
+}
+
+static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
+				      void *buf, int len, int write)
+{
+	dma_addr_t *dma_addr;
+	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+	dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
+	if (!dma_addr)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < npages; ++i) {
+		dma_addr_t addr;
+		struct page *page;
+
+		if (is_vmalloc_addr(buf))
+			page = vmalloc_to_page(buf);
+		else
+			page = virt_to_page(buf);
+
+		addr = dma_map_page(xe->drm.dev,
+				    page, 0, PAGE_SIZE,
+				    write ? DMA_TO_DEVICE :
+				    DMA_FROM_DEVICE);
+		if (dma_mapping_error(xe->drm.dev, addr))
+			goto err_fault;
+
+		dma_addr[i] = addr;
+		buf += PAGE_SIZE;
+	}
+
+	return dma_addr;
+
+err_fault:
+	xe_migrate_dma_unmap(xe, dma_addr, len, write);
+	return ERR_PTR(-EFAULT);
+}
+
+/**
+ * xe_migrate_access_memory - Access memory of a BO via GPU
+ *
+ * @m: The migration context.
+ * @bo: buffer object
+ * @offset: access offset into buffer object
+ * @buf: pointer to caller memory to read into or write from
+ * @len: length of access
+ * @write: write access
+ *
+ * Access memory of a BO via GPU either reading in or writing from a passed in
+ * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to
+ * read to or write from pointer.
+ *
+ * Returns:
+ * 0 if successful, negative error code on failure.
+ */
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+			     unsigned long offset, void *buf, int len,
+			     int write)
+{
+	struct xe_tile *tile = m->tile;
+	struct xe_device *xe = tile_to_xe(tile);
+	struct xe_res_cursor cursor;
+	struct dma_fence *fence = NULL;
+	dma_addr_t *dma_addr;
+	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
+	int bytes_left = len, current_page = 0;
+	void *orig_buf = buf;
+
+	xe_bo_assert_held(bo);
+
+	/* Use bounce buffer for small access and unaligned access */
+	if (len & XE_CACHELINE_MASK ||
+	    ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) {
+		int buf_offset = 0;
+
+		/*
+		 * Less than ideal for large unaligned access but this should be
+		 * fairly rare, can fixup if this becomes common.
+		 */
+		do {
+			u8 bounce[XE_CACHELINE_BYTES];
+			void *ptr = (void *)bounce;
+			int err;
+			int copy_bytes = min_t(int, bytes_left,
+					       XE_CACHELINE_BYTES -
+					       (offset & XE_CACHELINE_MASK));
+			int ptr_offset = offset & XE_CACHELINE_MASK;
+
+			err = xe_migrate_access_memory(m, bo,
+						       offset &
+						       ~XE_CACHELINE_MASK,
+						       (void *)ptr,
+						       sizeof(bounce), 0);
+			if (err)
+				return err;
+
+			if (!write) {
+				memcpy(buf + buf_offset, ptr + ptr_offset,
+				       copy_bytes);
+				goto next;
+			}
+
+			memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes);
+			err = xe_migrate_access_memory(m, bo,
+						       offset & ~XE_CACHELINE_MASK,
+						       (void *)ptr,
+						       sizeof(bounce), 0);
+			if (err)
+				return err;
+
+next:
+			bytes_left -= copy_bytes;
+			buf_offset += copy_bytes;
+			offset += copy_bytes;
+		} while (bytes_left);
+
+		return 0;
+	}
+
+	dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
+	if (IS_ERR(dma_addr))
+		return PTR_ERR(dma_addr);
+
+	xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor);
+
+	do {
+		struct dma_fence *__fence;
+		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
+			cursor.start;
+		int current_bytes;
+
+		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
+			current_bytes = min_t(int, bytes_left,
+					      MAX_PREEMPTDISABLE_TRANSFER);
+		else
+			current_bytes = min_t(int, bytes_left, cursor.size);
+
+		if (fence)
+			dma_fence_put(fence);
+
+		__fence = xe_migrate_vram(m, current_bytes,
+					  (unsigned long)buf & ~PAGE_MASK,
+					  dma_addr + current_page,
+					  vram_addr, write ?
+					  XE_MIGRATE_COPY_TO_VRAM :
+					  XE_MIGRATE_COPY_TO_SRAM);
+		if (IS_ERR(__fence)) {
+			if (fence)
+				dma_fence_wait(fence, false);
+			fence = __fence;
+			goto out_err;
+		}
+		fence = __fence;
+
+		buf += current_bytes;
+		offset += current_bytes;
+		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
+		bytes_left -= current_bytes;
+		if (bytes_left)
+			xe_res_next(&cursor, current_bytes);
+	} while (bytes_left);
+
+	dma_fence_wait(fence, false);
+	dma_fence_put(fence);
+	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+
+	return 0;
+
+out_err:
+	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
+	return PTR_ERR(fence);
+}
+
 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
 #include "tests/xe_migrate.c"
 #endif
diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h
index 6ff9a963425c..fb9839c1bae0 100644
--- a/drivers/gpu/drm/xe/xe_migrate.h
+++ b/drivers/gpu/drm/xe/xe_migrate.h
@@ -112,6 +112,10 @@  struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 				  struct ttm_resource *dst,
 				  bool copy_only_ccs);
 
+int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
+			     unsigned long offset, void *buf, int len,
+			     int write);
+
 #define XE_MIGRATE_CLEAR_FLAG_BO_DATA		BIT(0)
 #define XE_MIGRATE_CLEAR_FLAG_CCS_DATA		BIT(1)
 #define XE_MIGRATE_CLEAR_FLAG_FULL	(XE_MIGRATE_CLEAR_FLAG_BO_DATA | \