[v2,3/6] drm/i915/guc: use a separate GuC client for each engine

Message ID: 1469124942-17943-4-git-send-email-david.s.gordon@intel.com

Commit Message

Dave Gordon July 21, 2016, 6:15 p.m. UTC
When using a single GuC client for multiple engines, the i915 driver has
to merge all work items into a single work queue, which the GuC firmware
then demultiplexes into separate submission queues per engine. In
theory, this could lead to the single queue becoming a bottleneck in
which an excess of outstanding work for one or more engines might
prevent work for an idle engine reaching the hardware.

To reduce this risk, we can create one GuC client per engine. Each will
have its own workqueue, to be used only for work targeting a single
engine, so there will be no cross-engine contention for workqueue slots.

Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c        | 25 ++++++++++++++++-----
 drivers/gpu/drm/i915/i915_guc_submission.c | 35 +++++++++++++++++++-----------
 drivers/gpu/drm/i915/intel_guc.h           |  2 +-
 3 files changed, 42 insertions(+), 20 deletions(-)
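
At its heart the change is small: the single execbuf_client pointer in
struct intel_guc becomes a per-engine array, and the submission path
indexes it by the request's engine, so work items for different engines
can no longer contend for slots in a shared workqueue. Excerpted from
the diff below:

	/* intel_guc.h: one client per engine, not one shared client */
	struct i915_guc_client *exec_clients[I915_NUM_ENGINES];

	/* i915_guc_submit(): route each request to its engine's client */
	unsigned int engine_id = rq->engine->id;
	struct intel_guc *guc = &rq->i915->guc;
	struct i915_guc_client *client = guc->exec_clients[engine_id];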

Comments

Chris Wilson July 21, 2016, 6:30 p.m. UTC | #1
On Thu, Jul 21, 2016 at 07:15:39PM +0100, Dave Gordon wrote:
> When using a single GuC client for multiple engines, the i915 driver has
> to merge all work items into a single work queue, which the GuC firmware
> then demultiplexes into separate submission queues per engine. In
> theory, this could lead to the single queue becoming a bottleneck in
> which an excess of outstanding work for one or more engines might
> prevent work for an idle engine reaching the hardware.
> 
> To reduce this risk, we can create one GuC client per engine. Each will
> have its own workqueue, to be used only for work targeting a single
> engine, so there will be no cross-engine contention for workqueue slots.
> 
> Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Does guc_context_desc.engines_used have any effect?
-Chris
Dave Gordon July 22, 2016, 10:23 a.m. UTC | #2
On 21/07/16 19:30, Chris Wilson wrote:
> On Thu, Jul 21, 2016 at 07:15:39PM +0100, Dave Gordon wrote:
>> When using a single GuC client for multiple engines, the i915 driver has
>> to merge all work items into a single work queue, which the GuC firmware
>> then demultiplexes into separate submission queues per engine. In
>> theory, this could lead to the single queue becoming a bottleneck in
>> which an excess of outstanding work for one or more engines might
>> prevent work for an idle engine reaching the hardware.
>>
>> To reduce this risk, we can create one GuC client per engine. Each will
>> have its own workqueue, to be used only for work targeting a single
>> engine, so there will be no cross-engine contention for workqueue slots.
>>
>> Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
>> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> Does guc_context_desc.engines_used have any effect?
> -Chris

Yes, some of the firmware code uses it to optimise which queues it scans 
at certain times. If it knows that a certain queue *doesn't* contain 
work for a given engine, it can skip scanning that queue entirely.
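
Something like this, purely as an illustrative sketch (the firmware
source isn't public, so ctx_pool[], engine_id and scan_work_queue() are
invented names for the sake of the example; only
guc_context_desc.engines_used is real):

	/*
	 * Hypothetical firmware-side scan: engines_used is a bitmask of
	 * the engines whose work items may appear in this context's
	 * workqueue, so any queue that cannot contain work for the
	 * engine currently being scheduled is skipped without being
	 * read at all.
	 */
	for (ctx_id = 0; ctx_id < GUC_MAX_GPU_CONTEXTS; ctx_id++) {
		struct guc_context_desc *desc = &ctx_pool[ctx_id];

		if (!(desc->engines_used & (1 << engine_id)))
			continue;	/* can't have work for this engine */

		scan_work_queue(desc, engine_id);
	}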

Does this patchset change the results in the parallel-submission nop test?

.Dave.
Chris Wilson July 22, 2016, 9:57 p.m. UTC | #3
On Fri, Jul 22, 2016 at 11:23:11AM +0100, Dave Gordon wrote:
> Does this patchset change the results in the parallel-submission nop test?

No.

Using Execlists submission
Maximum execution latency on render, 3.392us, total 7.794us per cycle
All (4 engines): 8,257,536 cycles, average 1.211us per cycle

default: 5,789,696 cycles: 3.455us
render: 5,748,736 cycles: 3.479us
bsd: 13,406,208 cycles: 1.492us
blt: 13,852,672 cycles: 1.444us
vebox: 13,680,640 cycles: 1.462us

Using GuC submission (v6.1)
Maximum execution latency on render, 3.674us, total 9.089us per cycle
All (4 engines): 577,536 cycles, average 17.441us per cycle

default: 5,399,552 cycles: 3.704us
render: 5,341,184 cycles: 3.745us
bsd: 13,060,096 cycles: 1.531us
blt: 8,828,928 cycles: 2.265us
vebox: 12,945,408 cycles: 1.545us

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 9aa62c5..793b1d9 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2564,20 +2564,26 @@  static int i915_guc_info(struct seq_file *m, void *data)
 	struct drm_device *dev = node->minor->dev;
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct intel_guc guc;
-	struct i915_guc_client client = {};
+	struct i915_guc_client *clients;
 	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 	u64 total = 0;
 
 	if (!HAS_GUC_SCHED(dev_priv))
 		return 0;
 
+	clients = kcalloc(I915_NUM_ENGINES, sizeof(*clients), GFP_KERNEL);
+	if (clients == NULL)
+		return -ENOMEM;
+
 	if (mutex_lock_interruptible(&dev->struct_mutex))
-		return 0;
+		goto done;
 
 	/* Take a local copy of the GuC data, so we can dump it at leisure */
 	guc = dev_priv->guc;
-	if (guc.execbuf_client)
-		client = *guc.execbuf_client;
+	for_each_engine_id(engine, dev_priv, id)
+		if (guc.exec_clients[id])
+			clients[id] = *guc.exec_clients[id];
 
 	mutex_unlock(&dev->struct_mutex);
 
@@ -2600,11 +2606,18 @@  static int i915_guc_info(struct seq_file *m, void *data)
 	}
 	seq_printf(m, "\t%s: %llu\n", "Total", total);
 
-	seq_printf(m, "\nGuC execbuf client @ %p:\n", guc.execbuf_client);
-	i915_guc_client_info(m, dev_priv, &client);
+	for_each_engine_id(engine, dev_priv, id) {
+		seq_printf(m, "\nGuC exec_client[%d] @ %p:\n",
+				id, guc.exec_clients[id]);
+		if (guc.exec_clients[id])
+			i915_guc_client_info(m, dev_priv, &clients[id]);
+	}
 
 	/* Add more as required ... */
 
+done:
+	kfree(clients);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 816bdca..9999797 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -434,7 +434,9 @@  static void guc_fini_ctx_desc(struct intel_guc *guc,
 int i915_guc_wq_check_space(struct drm_i915_gem_request *request)
 {
 	const size_t wqi_size = sizeof(struct guc_wq_item);
-	struct i915_guc_client *gc = request->i915->guc.execbuf_client;
+	enum intel_engine_id engine_id = request->engine->id;
+	struct intel_guc *guc = &request->i915->guc;
+	struct i915_guc_client *gc = guc->exec_clients[engine_id];
 	struct guc_process_desc *desc;
 	u32 freespace;
 
@@ -589,7 +591,7 @@  int i915_guc_submit(struct drm_i915_gem_request *rq)
 {
 	unsigned int engine_id = rq->engine->id;
 	struct intel_guc *guc = &rq->i915->guc;
-	struct i915_guc_client *client = guc->execbuf_client;
+	struct i915_guc_client *client = guc->exec_clients[engine_id];
 	int b_ret;
 
 	guc_add_workqueue_item(client, rq);
@@ -723,7 +725,7 @@  static bool guc_doorbell_check(struct intel_guc *guc, uint16_t db_id)
  */
 static void guc_init_doorbell_hw(struct intel_guc *guc)
 {
-	struct i915_guc_client *client = guc->execbuf_client;
+	struct i915_guc_client *client = guc->exec_clients[RCS];
 	uint16_t db_id;
 	int i, err;
 
@@ -1004,17 +1006,21 @@  int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 {
 	struct intel_guc *guc = &dev_priv->guc;
 	struct i915_guc_client *client;
+	struct intel_engine_cs *engine;
 
-	/* client for execbuf submission */
-	client = guc_client_alloc(dev_priv,
-				  GUC_CTX_PRIORITY_KMD_NORMAL,
-				  dev_priv->kernel_context);
-	if (!client) {
-		DRM_ERROR("Failed to create execbuf guc_client\n");
-		return -ENOMEM;
+	for_each_engine(engine, dev_priv) {
+		/* client for execbuf submission */
+		client = guc_client_alloc(dev_priv,
+					  GUC_CTX_PRIORITY_KMD_NORMAL,
+					  dev_priv->kernel_context);
+		if (!client) {
+			DRM_ERROR("Failed to create GuC client(s)\n");
+			return -ENOMEM;
+		}
+
+		guc->exec_clients[engine->id] = client;
 	}
 
-	guc->execbuf_client = client;
 	host2guc_sample_forcewake(guc, client);
 	guc_init_doorbell_hw(guc);
 
@@ -1024,9 +1030,12 @@  int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 void i915_guc_submission_disable(struct drm_i915_private *dev_priv)
 {
 	struct intel_guc *guc = &dev_priv->guc;
+	struct intel_engine_cs *engine;
 
-	guc_client_free(dev_priv, guc->execbuf_client);
-	guc->execbuf_client = NULL;
+	for_each_engine(engine, dev_priv) {
+		guc_client_free(dev_priv, guc->exec_clients[engine->id]);
+		guc->exec_clients[engine->id] = NULL;
+	}
 }
 
 void i915_guc_submission_fini(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 3e3e743..7b4cc4d 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -132,7 +132,7 @@  struct intel_guc {
 	struct drm_i915_gem_object *ctx_pool_obj;
 	struct ida ctx_ids;
 
-	struct i915_guc_client *execbuf_client;
+	struct i915_guc_client *exec_clients[I915_NUM_ENGINES];
 
 	DECLARE_BITMAP(doorbell_bitmap, GUC_MAX_DOORBELLS);
 	uint32_t db_cacheline;		/* Cyclic counter mod pagesize	*/