diff mbox

[v6,32/34] drm/i915: Allow scheduler to manage inter-ring object synchronisation

Message ID 1461172435-4256-33-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

John Harrison April 20, 2016, 5:13 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

The scheduler has always tracked batch buffer dependencies based on
DRM object usage. This means that it will not submit a batch on one
ring that has outstanding dependencies still executing on other rings.
This is exactly the same synchronisation performed by
i915_gem_object_sync() using hardware semaphores where available and
CPU stalls where not (e.g. in execlist mode and/or on Gen8 hardware).

Unfortunately, when a batch buffer is submitted to the driver the
_object_sync() call happens first. Thus in case where hardware
semaphores are disabled, the driver has already stalled until the
dependency has been resolved.

This patch adds an optimisation to _object_sync() to ignore the
synchronisation in the case where it will subsequently be handled by
the scheduler. This removes the driver stall and (in the single
application case) provides near hardware semaphore performance even
when hardware semaphores are disabled. In a busy system where there is
other work that can be executed on the stalling ring, it provides
better than hardware semaphore performance as it removes the stall
from both the driver and from the hardware. There is also a theory
that this method should improve power usage as hardware semaphores are
apparently not very power efficient - the stalled ring does not go
into as low a power a state as when it is genuinely idle.

The optimisation is to check whether both ends of the synchronisation
are batch buffer requests. If they are, then the scheduler will have
the inter-dependency tracked and managed. If one or other end is not a
batch buffer request (e.g. a page flip) then the code falls back to
the CPU stall or hardware semaphore as appropriate.

To check whether the existing usage is a batch buffer, the code simply
calls the 'are you tracking this request' function of the scheduler on
the object's last_read_req member. To check whether the new usage is a
batch buffer, a flag is passed in from the caller.

v6: Updated to newer nightly (lots of ring -> engine renaming).

Replaced the i915_scheduler_is_request_tracked() function with
i915_scheduler_is_request_batch_buffer() as the need for the former
has gone away and it was really being used to ask the latter question
in a convoluted manner. [review feedback from Joonas Lahtinen]

Issue: VIZ-5566
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  2 +-
 drivers/gpu/drm/i915/i915_gem.c            | 16 +++++++++++++---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  2 +-
 drivers/gpu/drm/i915/i915_scheduler.c      | 19 +++++++++++++++++++
 drivers/gpu/drm/i915/i915_scheduler.h      |  1 +
 drivers/gpu/drm/i915/intel_display.c       |  2 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  2 +-
 7 files changed, 37 insertions(+), 7 deletions(-)
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4d5d059..cb346ab 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3101,7 +3101,7 @@  static inline void i915_gem_object_unpin_map(struct drm_i915_gem_object *obj)
 int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
 int i915_gem_object_sync(struct drm_i915_gem_object *obj,
 			 struct intel_engine_cs *to,
-			 struct drm_i915_gem_request **to_req);
+			 struct drm_i915_gem_request **to_req, bool to_batch);
 void i915_vma_move_to_active(struct i915_vma *vma,
 			     struct drm_i915_gem_request *req);
 int i915_gem_dumb_create(struct drm_file *file_priv,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 467d7da..818113d 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3658,7 +3658,7 @@  static int
 __i915_gem_object_sync(struct drm_i915_gem_object *obj,
 		       struct intel_engine_cs *to,
 		       struct drm_i915_gem_request *from_req,
-		       struct drm_i915_gem_request **to_req)
+		       struct drm_i915_gem_request **to_req, bool to_batch)
 {
 	struct intel_engine_cs *from;
 	int ret;
@@ -3670,6 +3670,14 @@  __i915_gem_object_sync(struct drm_i915_gem_object *obj,
 	if (i915_gem_request_completed(from_req))
 		return 0;
 
+	/*
+	 * The scheduler will manage inter-ring object dependencies
+	 * as long as both to and from requests are scheduler managed
+	 * (i.e. batch buffers).
+	 */
+	if (to_batch && i915_scheduler_is_request_batch_buffer(from_req))
+		return 0;
+
 	if (!i915_semaphore_is_enabled(obj->base.dev)) {
 		struct drm_i915_private *i915 = to_i915(obj->base.dev);
 		uint32_t flags;
@@ -3728,6 +3736,8 @@  __i915_gem_object_sync(struct drm_i915_gem_object *obj,
  * @to_req: request we wish to use the object for. See below.
  *          This will be allocated and returned if a request is
  *          required but not passed in.
+ * @to_batch: is the sync request on behalf of batch buffer submission?
+ * If so then the scheduler can (potentially) manage the synchronisation.
  *
  * This code is meant to abstract object synchronization with the GPU.
  * Calling with NULL implies synchronizing the object with the CPU
@@ -3758,7 +3768,7 @@  __i915_gem_object_sync(struct drm_i915_gem_object *obj,
 int
 i915_gem_object_sync(struct drm_i915_gem_object *obj,
 		     struct intel_engine_cs *to,
-		     struct drm_i915_gem_request **to_req)
+		     struct drm_i915_gem_request **to_req, bool to_batch)
 {
 	const bool readonly = obj->base.pending_write_domain == 0;
 	struct drm_i915_gem_request *req[I915_NUM_ENGINES];
@@ -3780,7 +3790,7 @@  i915_gem_object_sync(struct drm_i915_gem_object *obj,
 				req[n++] = obj->last_read_req[i];
 	}
 	for (i = 0; i < n; i++) {
-		ret = __i915_gem_object_sync(obj, to, req[i], to_req);
+		ret = __i915_gem_object_sync(obj, to, req[i], to_req, to_batch);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 789f668..70ad67c 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -954,7 +954,7 @@  i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (obj->active & other_rings) {
-			ret = i915_gem_object_sync(obj, req->engine, &req);
+			ret = i915_gem_object_sync(obj, req->engine, &req, true);
 			if (ret)
 				return ret;
 		}
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index e7377bb..fd53833 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -1344,6 +1344,25 @@  bool i915_scheduler_is_mutex_required(struct drm_i915_gem_request *req)
 }
 
 /**
+ * i915_scheduler_is_request_batch_buffer - is this request related to a
+ * batch buffer object?
+ * @req: request to be queried
+ *
+ * Returns true if the given request is for a batch buffer. False means it
+ * is for something else - page flip, context initialisation, etc.
+ */
+bool i915_scheduler_is_request_batch_buffer(struct drm_i915_gem_request *req)
+{
+	if (req->scheduler_qe == NULL)
+		return false;
+
+	if (req->scheduler_qe->params.batch_obj == NULL)
+		return false;
+
+	return true;
+}
+
+/**
  * i915_scheduler_closefile - notify the scheduler that a DRM file handle
  * has been closed.
  * @dev: DRM device
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index bdb9403..0df16e7 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -151,6 +151,7 @@  int i915_scheduler_flush(struct intel_engine_cs *engine, bool is_locked);
 int i915_scheduler_flush_stamp(struct intel_engine_cs *engine,
 			       unsigned long stamp, bool is_locked);
 bool i915_scheduler_is_mutex_required(struct drm_i915_gem_request *req);
+bool i915_scheduler_is_request_batch_buffer(struct drm_i915_gem_request *req);
 int i915_scheduler_query_stats(struct intel_engine_cs *engine,
 			       struct i915_scheduler_stats_nodes *stats);
 bool i915_scheduler_file_queue_wait(struct drm_file *file);
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index f1d730a..0e5da38 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11607,7 +11607,7 @@  static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	 * into the display plane and skip any waits.
 	 */
 	if (!mmio_flip) {
-		ret = i915_gem_object_sync(obj, engine, &request);
+		ret = i915_gem_object_sync(obj, engine, &request, false);
 		if (ret)
 			goto cleanup_pending;
 	}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 02808f7..c1263e6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -698,7 +698,7 @@  static int execlists_move_to_gpu(struct drm_i915_gem_request *req,
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (obj->active & other_rings) {
-			ret = i915_gem_object_sync(obj, req->engine, &req);
+			ret = i915_gem_object_sync(obj, req->engine, &req, true);
 			if (ret)
 				return ret;
 		}