[v2,10/37] drm/i915/blt: support copying objects

Message ID 20190627205633.1143-11-matthew.auld@intel.com (mailing list archive)
State New, archived
Series Introduce memory region concept (including device local memory)

Commit Message

Matthew Auld June 27, 2019, 8:56 p.m. UTC
We can already clear an object with the blt, so try to do the same to
support copying from one object backing store to another. Really this is
just object -> object, which is not that useful yet; what we really want
is to copy between two different backing stores, but that will require
some vma rework first, otherwise we are stuck with "tmp" objects.

Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_object_blt.c    | 135 ++++++++++++++++++
 .../gpu/drm/i915/gem/i915_gem_object_blt.h    |   8 ++
 .../i915/gem/selftests/i915_gem_object_blt.c  | 105 ++++++++++++++
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |   3 +-
 4 files changed, 250 insertions(+), 1 deletion(-)

Comments

Chris Wilson June 27, 2019, 11:35 p.m. UTC | #1
Quoting Matthew Auld (2019-06-27 21:56:06)
> We can already clear an object with the blt, so try to do the same to
> support copying from one object backing store to another. Really this is
> just object -> object, which is not that useful yet; what we really want
> is to copy between two different backing stores, but that will require
> some vma rework first, otherwise we are stuck with "tmp" objects.
> 
> Signed-off-by: Matthew Auld <matthew.auld@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
> ---
>  .../gpu/drm/i915/gem/i915_gem_object_blt.c    | 135 ++++++++++++++++++
>  .../gpu/drm/i915/gem/i915_gem_object_blt.h    |   8 ++
>  .../i915/gem/selftests/i915_gem_object_blt.c  | 105 ++++++++++++++
>  drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |   3 +-
>  4 files changed, 250 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
> index cb42e3a312e2..c2b28e06c379 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
> @@ -102,6 +102,141 @@ int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
>         return err;
>  }
>  
> +int intel_emit_vma_copy_blt(struct i915_request *rq,
> +                           struct i915_vma *src,
> +                           struct i915_vma *dst)
> +{
> +       const int gen = INTEL_GEN(rq->i915);
> +       u32 *cs;
> +
> +       GEM_BUG_ON(src->size != dst->size);

For a low-level interface, I would suggest a little over-engineering and
taking src_offset, dst_offset and length. For bonus points, 2D -- but I
accept that may be too much over-engineering without a user.
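
Something along the lines of the following, say (an untested sketch just
to illustrate the suggested parameters; names made up):

int intel_emit_vma_copy_blt(struct i915_request *rq,
			    struct i915_vma *src, u64 src_offset,
			    struct i915_vma *dst, u64 dst_offset,
			    u64 length);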

> +       cs = intel_ring_begin(rq, 10);
> +       if (IS_ERR(cs))
> +               return PTR_ERR(cs);
> +
> +       if (gen >= 9) {
> +               *cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10-2);
> +               *cs++ = BLT_DEPTH_32 | PAGE_SIZE;
> +               *cs++ = 0;
> +               *cs++ = src->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +               *cs++ = lower_32_bits(dst->node.start);
> +               *cs++ = upper_32_bits(dst->node.start);
> +               *cs++ = 0;
> +               *cs++ = PAGE_SIZE;
> +               *cs++ = lower_32_bits(src->node.start);
> +               *cs++ = upper_32_bits(src->node.start);

Reminds me that we didn't fix the earlier routines to handle more than
32k pages either. Please add a test case :)
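
Something like the below, say (a completely untested sketch reusing the
gen9 encoding above): the XY blt height field is only 16 bits, so emit
one blit per chunk of at most S16_MAX pages rather than assuming the
whole object fits in a single command.

static int emit_copy_chunked(struct i915_request *rq,
			     struct i915_vma *src,
			     struct i915_vma *dst)
{
	u64 offset = 0;
	u64 rem = src->size;

	while (rem) {
		u64 size = min_t(u64, rem, (u64)S16_MAX * PAGE_SIZE);
		u32 *cs;

		cs = intel_ring_begin(rq, 10);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
		*cs++ = 0;
		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
		*cs++ = lower_32_bits(dst->node.start + offset);
		*cs++ = upper_32_bits(dst->node.start + offset);
		*cs++ = 0;
		*cs++ = PAGE_SIZE;
		*cs++ = lower_32_bits(src->node.start + offset);
		*cs++ = upper_32_bits(src->node.start + offset);

		intel_ring_advance(rq, cs);

		offset += size;
		rem -= size;
	}

	return 0;
}
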
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
index cb42e3a312e2..c2b28e06c379 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c
@@ -102,6 +102,141 @@  int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
 	return err;
 }
 
+int intel_emit_vma_copy_blt(struct i915_request *rq,
+			    struct i915_vma *src,
+			    struct i915_vma *dst)
+{
+	const int gen = INTEL_GEN(rq->i915);
+	u32 *cs;
+
+	GEM_BUG_ON(src->size != dst->size);
+
+	cs = intel_ring_begin(rq, 10);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	if (gen >= 9) {
+		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10-2);
+		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
+		*cs++ = 0;
+		*cs++ = src->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+		*cs++ = lower_32_bits(dst->node.start);
+		*cs++ = upper_32_bits(dst->node.start);
+		*cs++ = 0;
+		*cs++ = PAGE_SIZE;
+		*cs++ = lower_32_bits(src->node.start);
+		*cs++ = upper_32_bits(src->node.start);
+	} else if (gen >= 8) {
+		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10-2);
+		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+		*cs++ = 0;
+		*cs++ = src->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+		*cs++ = lower_32_bits(dst->node.start);
+		*cs++ = upper_32_bits(dst->node.start);
+		*cs++ = 0;
+		*cs++ = PAGE_SIZE;
+		*cs++ = lower_32_bits(src->node.start);
+		*cs++ = upper_32_bits(src->node.start);
+	} else {
+		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (8-2);
+		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+		*cs++ = 0;
+		*cs++ = src->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+		*cs++ = dst->node.start;
+		*cs++ = 0;
+		*cs++ = PAGE_SIZE;
+		*cs++ = src->node.start;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+	}
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+int i915_gem_object_copy_blt(struct drm_i915_gem_object *src,
+			     struct drm_i915_gem_object *dst,
+			     struct intel_context *ce)
+{
+	struct drm_i915_private *i915 = to_i915(src->base.dev);
+	struct i915_gem_context *ctx = ce->gem_context;
+	struct i915_address_space *vm = ctx->vm ?: &i915->ggtt.vm;
+	struct drm_gem_object *objs[] = { &src->base, &dst->base };
+	struct ww_acquire_ctx acquire;
+	struct i915_vma *vma_src, *vma_dst;
+	struct i915_request *rq;
+	int err;
+
+	vma_src = i915_vma_instance(src, vm, NULL);
+	if (IS_ERR(vma_src))
+		return PTR_ERR(vma_src);
+
+	err = i915_vma_pin(vma_src, 0, 0, PIN_USER);
+	if (unlikely(err))
+		return err;
+
+	vma_dst = i915_vma_instance(dst, vm, NULL);
+	if (IS_ERR(vma_dst)) {
+		err = PTR_ERR(vma_dst);
+		goto out_unpin_src;
+	}
+
+	err = i915_vma_pin(vma_dst, 0, 0, PIN_USER);
+	if (unlikely(err))
+		goto out_unpin_src;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_unpin_dst;
+	}
+
+	err = drm_gem_lock_reservations(objs, ARRAY_SIZE(objs), &acquire);
+	if (unlikely(err))
+		goto out_request;
+
+	if (src->cache_dirty & ~src->cache_coherent)
+		i915_gem_clflush_object(src, 0);
+
+	if (dst->cache_dirty & ~dst->cache_coherent)
+		i915_gem_clflush_object(dst, 0);
+
+	err = i915_request_await_object(rq, src, false);
+	if (unlikely(err))
+		goto out_unlock;
+
+	err = i915_vma_move_to_active(vma_src, rq, 0);
+	if (unlikely(err))
+		goto out_unlock;
+
+	err = i915_request_await_object(rq, dst, true);
+	if (unlikely(err))
+		goto out_unlock;
+
+	err = i915_vma_move_to_active(vma_dst, rq, EXEC_OBJECT_WRITE);
+	if (unlikely(err))
+		goto out_unlock;
+
+	if (ce->engine->emit_init_breadcrumb) {
+		err = ce->engine->emit_init_breadcrumb(rq);
+		if (unlikely(err))
+			goto out_unlock;
+	}
+
+	err = intel_emit_vma_copy_blt(rq, vma_src, vma_dst);
+out_unlock:
+	drm_gem_unlock_reservations(objs, ARRAY_SIZE(objs), &acquire);
+out_request:
+	if (unlikely(err))
+		i915_request_skip(rq, err);
+
+	i915_request_add(rq);
+out_unpin_dst:
+	i915_vma_unpin(vma_dst);
+out_unpin_src:
+	i915_vma_unpin(vma_src);
+	return err;
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/i915_gem_object_blt.c"
 #endif
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h
index 7ec7de6ac0c0..17fac835f391 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.h
@@ -21,4 +21,12 @@  int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
 			     struct intel_context *ce,
 			     u32 value);
 
+int intel_emit_vma_copy_blt(struct i915_request *rq,
+			    struct i915_vma *src,
+			    struct i915_vma *dst);
+
+int i915_gem_object_copy_blt(struct drm_i915_gem_object *src,
+			     struct drm_i915_gem_object *dst,
+			     struct intel_context *ce);
+
 #endif
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c
index e23d8c9e9298..1f28a12f7bb4 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_object_blt.c
@@ -94,10 +94,115 @@  static int igt_fill_blt(void *arg)
 	return err;
 }
 
+static int igt_copy_blt(void *arg)
+{
+	struct intel_context *ce = arg;
+	struct drm_i915_private *i915 = ce->gem_context->i915;
+	struct drm_i915_gem_object *src, *dst;
+	struct rnd_state prng;
+	IGT_TIMEOUT(end);
+	u32 *vaddr;
+	int err = 0;
+
+	prandom_seed_state(&prng, i915_selftest.random_seed);
+
+	do {
+		u32 sz = prandom_u32_state(&prng) % SZ_32M;
+		u32 val = prandom_u32_state(&prng);
+		u32 i;
+
+		sz = round_up(sz, PAGE_SIZE);
+
+		pr_debug("%s with sz=%x, val=%x\n", __func__, sz, val);
+
+		src = i915_gem_object_create_internal(i915, sz);
+		if (IS_ERR(src)) {
+			err = PTR_ERR(src);
+			goto err_flush;
+		}
+
+		vaddr = i915_gem_object_pin_map(src, I915_MAP_WB);
+		if (IS_ERR(vaddr)) {
+			err = PTR_ERR(vaddr);
+			goto err_put_src;
+		}
+
+		memset32(vaddr, val, src->base.size / sizeof(u32));
+
+		i915_gem_object_unpin_map(src);
+
+		if (!(src->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
+			src->cache_dirty = true;
+
+		dst = i915_gem_object_create_internal(i915, sz);
+		if (IS_ERR(dst)) {
+			err = PTR_ERR(dst);
+			goto err_put_src;
+		}
+
+		vaddr = i915_gem_object_pin_map(dst, I915_MAP_WB);
+		if (IS_ERR(vaddr)) {
+			err = PTR_ERR(vaddr);
+			goto err_put_dst;
+		}
+
+		memset32(vaddr, val ^ 0xdeadbeaf, dst->base.size / sizeof(u32));
+
+		if (!(dst->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
+			dst->cache_dirty = true;
+
+		mutex_lock(&i915->drm.struct_mutex);
+		err = i915_gem_object_copy_blt(src, dst, ce);
+		mutex_unlock(&i915->drm.struct_mutex);
+		if (err)
+			goto err_unpin;
+
+		i915_gem_object_lock(dst);
+		err = i915_gem_object_set_to_cpu_domain(dst, false);
+		i915_gem_object_unlock(dst);
+		if (err)
+			goto err_unpin;
+
+		for (i = 0; i < dst->base.size / sizeof(u32); ++i) {
+			if (vaddr[i] != val) {
+				pr_err("vaddr[%u]=%x, expected=%x\n", i,
+				       vaddr[i], val);
+				err = -EINVAL;
+				goto err_unpin;
+			}
+		}
+
+		i915_gem_object_unpin_map(dst);
+
+		i915_gem_object_put(src);
+		i915_gem_object_put(dst);
+	} while (!time_after(jiffies, end));
+
+	goto err_flush;
+
+err_unpin:
+	i915_gem_object_unpin_map(dst);
+err_put_dst:
+	i915_gem_object_put(dst);
+err_put_src:
+	i915_gem_object_put(src);
+err_flush:
+	mutex_lock(&i915->drm.struct_mutex);
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	if (err == -ENOMEM)
+		err = 0;
+
+	return err;
+}
+
 int i915_gem_object_blt_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
 		SUBTEST(igt_fill_blt),
+		SUBTEST(igt_copy_blt),
 	};
 
 	if (i915_terminally_wedged(i915))
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index eec31e36aca7..e3b23351669c 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -182,7 +182,8 @@ 
 #define COLOR_BLT_CMD			(2<<29 | 0x40<<22 | (5-2))
 #define XY_COLOR_BLT_CMD		(2 << 29 | 0x50 << 22)
 #define SRC_COPY_BLT_CMD		((2<<29)|(0x43<<22)|4)
-#define XY_SRC_COPY_BLT_CMD		((2<<29)|(0x53<<22)|6)
+#define GEN9_XY_FAST_COPY_BLT_CMD	((2<<29)|(0x42<<22))
+#define XY_SRC_COPY_BLT_CMD		((2<<29)|(0x53<<22))
 #define XY_MONO_SRC_COPY_IMM_BLT	((2<<29)|(0x71<<22)|5)
 #define   BLT_WRITE_A			(2<<20)
 #define   BLT_WRITE_RGB			(1<<20)