
[18/21] drm/i915: Embed signaling node into the GEM request

Message ID 1464970133-29859-19-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson June 3, 2016, 4:08 p.m. UTC
Under the assumption that enabling signaling will be a frequent
operation, let's preallocate our attachments for signaling inside the
request struct (and so benefit from the slab cache).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h          |  1 +
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 89 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  6 +++
 3 files changed, 56 insertions(+), 40 deletions(-)
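
In outline, the change replaces a per-call GFP_ATOMIC allocation with a
node embedded in the request itself. A condensed sketch of the
before/after shape (types abridged from the diff below):

	/* Before: one heap allocation per enable_signaling() call. */
	struct signal {
		struct rb_node node;
		struct intel_wait wait;
		struct drm_i915_gem_request *request;
	};

	/* After: the node lives inside the request, so it comes from the
	 * request's own slab allocation and enabling signaling can no
	 * longer fail with -ENOMEM.
	 */
	struct intel_signal_node {
		struct rb_node node;	/* position in the signals rbtree */
		struct intel_wait wait;	/* bottom-half registration */
	};

	struct drm_i915_gem_request {
		/* ... */
		struct intel_signal_node signaling;
	};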

Comments

Tvrtko Ursulin June 7, 2016, 12:31 p.m. UTC | #1
On 03/06/16 17:08, Chris Wilson wrote:
> Under the assumption that enabling signaling will be a frequent
> operation, let's preallocate our attachments for signaling inside the
> request struct (and so benefit from the slab cache).

Oh, you did this part, which I suggested in the previous patch. :)

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.h          |  1 +
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 89 ++++++++++++++++++--------------
>   drivers/gpu/drm/i915/intel_ringbuffer.h  |  6 +++
>   3 files changed, 56 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b0235372cfdf..88d9242398ce 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2363,6 +2363,7 @@ struct drm_i915_gem_request {
>   	struct drm_i915_private *i915;
>   	struct intel_engine_cs *engine;
>   	unsigned reset_counter;
> +	struct intel_signal_node signaling;
>
>   	 /** GEM sequence number associated with the previous request,
>   	  * when the HWS breadcrumb is equal to this the GPU is processing
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 143891a2b68a..8ab508ed4248 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -128,16 +128,14 @@ static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
>   	wake_up_process(wait->task); /* implicit smp_wmb() */
>   }
>
> -bool intel_engine_add_wait(struct intel_engine_cs *engine,
> -			   struct intel_wait *wait)
> +static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
> +				    struct intel_wait *wait)
>   {
>   	struct intel_breadcrumbs *b = &engine->breadcrumbs;
>   	struct rb_node **p, *parent, *completed;
>   	bool first;
>   	u32 seqno;
>
> -	spin_lock(&b->lock);
> -
>   	/* Insert the request into the retirement ordered list
>   	 * of waiters by walking the rbtree. If we are the oldest
>   	 * seqno in the tree (the first to be retired), then
> @@ -223,6 +221,17 @@ bool intel_engine_add_wait(struct intel_engine_cs *engine,
>   	GEM_BUG_ON(!b->first_wait);
>   	GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node);
>
> +	return first;
> +}
> +
> +bool intel_engine_add_wait(struct intel_engine_cs *engine,
> +			   struct intel_wait *wait)
> +{
> +	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> +	bool first;
> +
> +	spin_lock(&b->lock);
> +	first = __intel_engine_add_wait(engine, wait);
>   	spin_unlock(&b->lock);
>
>   	return first;
> @@ -323,35 +332,29 @@ out_unlock:
>   	spin_unlock(&b->lock);
>   }
>
> -struct signal {
> -	struct rb_node node;
> -	struct intel_wait wait;
> -	struct drm_i915_gem_request *request;
> -};
> -
> -static bool signal_complete(struct signal *signal)
> +static bool signal_complete(struct drm_i915_gem_request *request)
>   {
> -	if (signal == NULL)
> +	if (request == NULL)
>   		return false;
>
>   	/* If another process served as the bottom-half it may have already
>   	 * signalled that this wait is already completed.
>   	 */
> -	if (intel_wait_complete(&signal->wait))
> +	if (intel_wait_complete(&request->signaling.wait))
>   		return true;
>
>   	/* Carefully check if the request is complete, giving time for the
>   	 * seqno to be visible or if the GPU hung.
>   	 */
> -	if (__i915_request_irq_complete(signal->request))
> +	if (__i915_request_irq_complete(request))
>   		return true;
>
>   	return false;
>   }
>
> -static struct signal *to_signal(struct rb_node *rb)
> +static struct drm_i915_gem_request *to_signal(struct rb_node *rb)

Why is it called to_signal then?

>   {
> -	return container_of(rb, struct signal, node);
> +	return container_of(rb, struct drm_i915_gem_request, signaling.node);
>   }
>
>   static void signaler_set_rtpriority(void)
> @@ -364,7 +367,7 @@ static int intel_breadcrumbs_signaler(void *arg)
>   {
>   	struct intel_engine_cs *engine = arg;
>   	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> -	struct signal *signal;
> +	struct drm_i915_gem_request *request;
>
>   	/* Install ourselves with high priority to reduce signalling latency */
>   	signaler_set_rtpriority();
> @@ -380,14 +383,13 @@ static int intel_breadcrumbs_signaler(void *arg)
>   		 * need to wait for a new interrupt from the GPU or for
>   		 * a new client.
>   		 */
> -		signal = READ_ONCE(b->first_signal);
> -		if (signal_complete(signal)) {
> +		request = READ_ONCE(b->first_signal);
> +		if (signal_complete(request)) {
>   			/* Wake up all other completed waiters and select the
>   			 * next bottom-half for the next user interrupt.
>   			 */
> -			intel_engine_remove_wait(engine, &signal->wait);
> -
> -			i915_gem_request_unreference(signal->request);
> +			intel_engine_remove_wait(engine,
> +						 &request->signaling.wait);
>
>   			/* Find the next oldest signal. Note that as we have
>   			 * not been holding the lock, another client may
> @@ -396,12 +398,15 @@ static int intel_breadcrumbs_signaler(void *arg)
>   			 * the oldest before picking the next one.
>   			 */
>   			spin_lock(&b->lock);
> -			if (signal == b->first_signal)
> -				b->first_signal = rb_next(&signal->node);
> -			rb_erase(&signal->node, &b->signals);
> +			if (request == b->first_signal) {
> +				struct rb_node *rb =
> +					rb_next(&request->signaling.node);
> +				b->first_signal = rb ? to_signal(rb) : NULL;

Made me look at the previous patch to see how you didn't need to change 
the type of first_signal in this one. void*! :) Please fix it there. :)

> +			}
> +			rb_erase(&request->signaling.node, &b->signals);
>   			spin_unlock(&b->lock);
>
> -			kfree(signal);
> +			i915_gem_request_unreference(request);
>   		} else {
>   			if (kthread_should_stop())
>   				break;
> @@ -418,20 +423,23 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request)
>   	struct intel_engine_cs *engine = request->engine;
>   	struct intel_breadcrumbs *b = &engine->breadcrumbs;
>   	struct rb_node *parent, **p;
> -	struct signal *signal;
>   	bool first, wakeup;
>
>   	if (unlikely(IS_ERR(b->signaler)))
>   		return PTR_ERR(b->signaler);
>
> -	signal = kmalloc(sizeof(*signal), GFP_ATOMIC);
> -	if (unlikely(!signal))
> -		return -ENOMEM;
> +	if (unlikely(READ_ONCE(request->signaling.wait.task)))
> +		return 0;

Hmm, whether this is safe will depend on the following patches. I don't 
like the explosion of READ_ONCE and smp_store_mb calls in these patches. 
Something is bound to be broken.

You even check it below under the lock. So I am not sure this 
optimisation is worth it. Maybe leave it for later?

>
> -	signal->wait.task = b->signaler;
> -	signal->wait.seqno = request->seqno;
> +	spin_lock(&b->lock);
> +	if (unlikely(request->signaling.wait.task)) {
> +		wakeup = false;
> +		goto unlock;
> +	}
>
> -	signal->request = i915_gem_request_reference(request);
> +	request->signaling.wait.task = b->signaler;
> +	request->signaling.wait.seqno = request->seqno;
> +	i915_gem_request_reference(request);
>
>   	/* First add ourselves into the list of waiters, but register our
>   	 * bottom-half as the signaller thread. As per usual, only the oldest
> @@ -441,29 +449,30 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request)
>   	 * If we are the oldest waiter, enable the irq (after which we
>   	 * must double check that the seqno did not complete).
>   	 */
> -	wakeup = intel_engine_add_wait(engine, &signal->wait);
> +	wakeup = __intel_engine_add_wait(engine, &request->signaling.wait);
>
>   	/* Now insert ourselves into the retirement ordered list of signals
>   	 * on this engine. We track the oldest seqno as that will be the
>   	 * first signal to complete.
>   	 */
> -	spin_lock(&b->lock);
>   	parent = NULL;
>   	first = true;
>   	p = &b->signals.rb_node;
>   	while (*p) {
>   		parent = *p;
> -		if (i915_seqno_passed(signal->wait.seqno,
> -				      to_signal(parent)->wait.seqno)) {
> +		if (i915_seqno_passed(request->seqno,
> +				      to_signal(parent)->seqno)) {
>   			p = &parent->rb_right;
>   			first = false;
>   		} else
>   			p = &parent->rb_left;
>   	}
> -	rb_link_node(&signal->node, parent, p);
> -	rb_insert_color(&signal->node, &b->signals);
> +	rb_link_node(&request->signaling.node, parent, p);
> +	rb_insert_color(&request->signaling.node, &b->signals);
>   	if (first)
> -		smp_store_mb(b->first_signal, signal);
> +		smp_store_mb(b->first_signal, request);
> +
> +unlock:
>   	spin_unlock(&b->lock);
>
>   	if (wakeup)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index f4bca38caef0..5f7cb3d0ea1c 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -530,6 +530,12 @@ struct intel_wait {
>   	struct task_struct *task;
>   	u32 seqno;
>   };
> +
> +struct intel_signal_node {
> +	struct rb_node node;
> +	struct intel_wait wait;
> +};
> +
>   void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
>   static inline void intel_wait_init(struct intel_wait *wait, u32 seqno)
>   {
>

Otherwise looks OK.

Regards,

Tvrtko
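
The check Tvrtko queries above is double-checked locking: an unlocked
READ_ONCE() fast path, re-validated under b->lock before anything is
published. A deliberately cut-down sketch of the shape, reusing the
structures from the patch (not the driver function itself):

	static int enable_signaling_sketch(struct drm_i915_gem_request *request)
	{
		struct intel_breadcrumbs *b = &request->engine->breadcrumbs;

		/* Unlocked fast path: if a signaler task is already
		 * attached there is nothing to do. READ_ONCE() stops the
		 * compiler from tearing or caching the load, but the
		 * result may be stale.
		 */
		if (READ_ONCE(request->signaling.wait.task))
			return 0;

		spin_lock(&b->lock);
		/* Re-check under the lock: another thread may have
		 * attached the signaler between the unlocked check and
		 * taking the lock.
		 */
		if (!request->signaling.wait.task) {
			request->signaling.wait.task = b->signaler;
			/* ... reference the request, insert into
			 * b->signals, update b->first_signal ...
			 */
		}
		spin_unlock(&b->lock);

		return 0;
	}
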
Chris Wilson June 8, 2016, 9:54 a.m. UTC | #2
On Tue, Jun 07, 2016 at 01:31:25PM +0100, Tvrtko Ursulin wrote:
> >@@ -418,20 +423,23 @@ int intel_engine_enable_signaling(struct drm_i915_gem_request *request)
> >  	struct intel_engine_cs *engine = request->engine;
> >  	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> >  	struct rb_node *parent, **p;
> >-	struct signal *signal;
> >  	bool first, wakeup;
> >
> >  	if (unlikely(IS_ERR(b->signaler)))
> >  		return PTR_ERR(b->signaler);
> >
> >-	signal = kmalloc(sizeof(*signal), GFP_ATOMIC);
> >-	if (unlikely(!signal))
> >-		return -ENOMEM;
> >+	if (unlikely(READ_ONCE(request->signaling.wait.task)))
> >+		return 0;
> 
> Hmm, whether this is safe will depend on the following patches. I
> don't like the explosion of READ_ONCE and smp_store_mb calls in
> these patches. Something is bound to be broken.

This one is trivial :)
 
> You even check it below under the lock. So I am not sure this
> optimisation is worth it. Maybe leave it for later?

I was just worrying about contention, since the breadcrumbs lock is used
to guard both trees and contention among the waiters is noticeable.
-Chris
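
For reference, to_signal() relies on container_of() stepping back from
an embedded member to its enclosing structure. A toy, self-contained
illustration with stand-in types (not the kernel's definitions):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct rb_node { struct rb_node *left, *right; }; /* toy stand-in */

	struct signal_node {
		struct rb_node node;
	};

	struct request {
		int seqno;
		struct signal_node signaling;
	};

	/* Recover the enclosing request from its embedded rb_node. */
	static struct request *to_signal(struct rb_node *rb)
	{
		return container_of(rb, struct request, signaling.node);
	}

	int main(void)
	{
		struct request rq = { .seqno = 42 };

		printf("%d\n", to_signal(&rq.signaling.node)->seqno); /* 42 */
		return 0;
	}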

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b0235372cfdf..88d9242398ce 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2363,6 +2363,7 @@  struct drm_i915_gem_request {
 	struct drm_i915_private *i915;
 	struct intel_engine_cs *engine;
 	unsigned reset_counter;
+	struct intel_signal_node signaling;
 
 	 /** GEM sequence number associated with the previous request,
 	  * when the HWS breadcrumb is equal to this the GPU is processing
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 143891a2b68a..8ab508ed4248 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -128,16 +128,14 @@  static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
 	wake_up_process(wait->task); /* implicit smp_wmb() */
 }
 
-bool intel_engine_add_wait(struct intel_engine_cs *engine,
-			   struct intel_wait *wait)
+static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
+				    struct intel_wait *wait)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 	struct rb_node **p, *parent, *completed;
 	bool first;
 	u32 seqno;
 
-	spin_lock(&b->lock);
-
 	/* Insert the request into the retirement ordered list
 	 * of waiters by walking the rbtree. If we are the oldest
 	 * seqno in the tree (the first to be retired), then
@@ -223,6 +221,17 @@  bool intel_engine_add_wait(struct intel_engine_cs *engine,
 	GEM_BUG_ON(!b->first_wait);
 	GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node);
 
+	return first;
+}
+
+bool intel_engine_add_wait(struct intel_engine_cs *engine,
+			   struct intel_wait *wait)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+	bool first;
+
+	spin_lock(&b->lock);
+	first = __intel_engine_add_wait(engine, wait);
 	spin_unlock(&b->lock);
 
 	return first;
@@ -323,35 +332,29 @@  out_unlock:
 	spin_unlock(&b->lock);
 }
 
-struct signal {
-	struct rb_node node;
-	struct intel_wait wait;
-	struct drm_i915_gem_request *request;
-};
-
-static bool signal_complete(struct signal *signal)
+static bool signal_complete(struct drm_i915_gem_request *request)
 {
-	if (signal == NULL)
+	if (request == NULL)
 		return false;
 
 	/* If another process served as the bottom-half it may have already
 	 * signalled that this wait is already completed.
 	 */
-	if (intel_wait_complete(&signal->wait))
+	if (intel_wait_complete(&request->signaling.wait))
 		return true;
 
 	/* Carefully check if the request is complete, giving time for the
 	 * seqno to be visible or if the GPU hung.
 	 */
-	if (__i915_request_irq_complete(signal->request))
+	if (__i915_request_irq_complete(request))
 		return true;
 
 	return false;
 }
 
-static struct signal *to_signal(struct rb_node *rb)
+static struct drm_i915_gem_request *to_signal(struct rb_node *rb)
 {
-	return container_of(rb, struct signal, node);
+	return container_of(rb, struct drm_i915_gem_request, signaling.node);
 }
 
 static void signaler_set_rtpriority(void)
@@ -364,7 +367,7 @@  static int intel_breadcrumbs_signaler(void *arg)
 {
 	struct intel_engine_cs *engine = arg;
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct signal *signal;
+	struct drm_i915_gem_request *request;
 
 	/* Install ourselves with high priority to reduce signalling latency */
 	signaler_set_rtpriority();
@@ -380,14 +383,13 @@  static int intel_breadcrumbs_signaler(void *arg)
 		 * need to wait for a new interrupt from the GPU or for
 		 * a new client.
 		 */
-		signal = READ_ONCE(b->first_signal);
-		if (signal_complete(signal)) {
+		request = READ_ONCE(b->first_signal);
+		if (signal_complete(request)) {
 			/* Wake up all other completed waiters and select the
 			 * next bottom-half for the next user interrupt.
 			 */
-			intel_engine_remove_wait(engine, &signal->wait);
-
-			i915_gem_request_unreference(signal->request);
+			intel_engine_remove_wait(engine,
+						 &request->signaling.wait);
 
 			/* Find the next oldest signal. Note that as we have
 			 * not been holding the lock, another client may
@@ -396,12 +398,15 @@  static int intel_breadcrumbs_signaler(void *arg)
 			 * the oldest before picking the next one.
 			 */
 			spin_lock(&b->lock);
-			if (signal == b->first_signal)
-				b->first_signal = rb_next(&signal->node);
-			rb_erase(&signal->node, &b->signals);
+			if (request == b->first_signal) {
+				struct rb_node *rb =
+					rb_next(&request->signaling.node);
+				b->first_signal = rb ? to_signal(rb) : NULL;
+			}
+			rb_erase(&request->signaling.node, &b->signals);
 			spin_unlock(&b->lock);
 
-			kfree(signal);
+			i915_gem_request_unreference(request);
 		} else {
 			if (kthread_should_stop())
 				break;
@@ -418,20 +423,23 @@  int intel_engine_enable_signaling(struct drm_i915_gem_request *request)
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 	struct rb_node *parent, **p;
-	struct signal *signal;
 	bool first, wakeup;
 
 	if (unlikely(IS_ERR(b->signaler)))
 		return PTR_ERR(b->signaler);
 
-	signal = kmalloc(sizeof(*signal), GFP_ATOMIC);
-	if (unlikely(!signal))
-		return -ENOMEM;
+	if (unlikely(READ_ONCE(request->signaling.wait.task)))
+		return 0;
 
-	signal->wait.task = b->signaler;
-	signal->wait.seqno = request->seqno;
+	spin_lock(&b->lock);
+	if (unlikely(request->signaling.wait.task)) {
+		wakeup = false;
+		goto unlock;
+	}
 
-	signal->request = i915_gem_request_reference(request);
+	request->signaling.wait.task = b->signaler;
+	request->signaling.wait.seqno = request->seqno;
+	i915_gem_request_reference(request);
 
 	/* First add ourselves into the list of waiters, but register our
 	 * bottom-half as the signaller thread. As per usual, only the oldest
@@ -441,29 +449,30 @@  int intel_engine_enable_signaling(struct drm_i915_gem_request *request)
 	 * If we are the oldest waiter, enable the irq (after which we
 	 * must double check that the seqno did not complete).
 	 */
-	wakeup = intel_engine_add_wait(engine, &signal->wait);
+	wakeup = __intel_engine_add_wait(engine, &request->signaling.wait);
 
 	/* Now insert ourselves into the retirement ordered list of signals
 	 * on this engine. We track the oldest seqno as that will be the
 	 * first signal to complete.
 	 */
-	spin_lock(&b->lock);
 	parent = NULL;
 	first = true;
 	p = &b->signals.rb_node;
 	while (*p) {
 		parent = *p;
-		if (i915_seqno_passed(signal->wait.seqno,
-				      to_signal(parent)->wait.seqno)) {
+		if (i915_seqno_passed(request->seqno,
+				      to_signal(parent)->seqno)) {
 			p = &parent->rb_right;
 			first = false;
 		} else
 			p = &parent->rb_left;
 	}
-	rb_link_node(&signal->node, parent, p);
-	rb_insert_color(&signal->node, &b->signals);
+	rb_link_node(&request->signaling.node, parent, p);
+	rb_insert_color(&request->signaling.node, &b->signals);
 	if (first)
-		smp_store_mb(b->first_signal, signal);
+		smp_store_mb(b->first_signal, request);
+
+unlock:
 	spin_unlock(&b->lock);
 
 	if (wakeup)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index f4bca38caef0..5f7cb3d0ea1c 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -530,6 +530,12 @@  struct intel_wait {
 	struct task_struct *task;
 	u32 seqno;
 };
+
+struct intel_signal_node {
+	struct rb_node node;
+	struct intel_wait wait;
+};
+
 void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
 static inline void intel_wait_init(struct intel_wait *wait, u32 seqno)
 {