[14/21] drm/i915: Only apply one barrier after a breadcrumb interrupt is posted

Message ID: 1464970133-29859-15-git-send-email-chris@chris-wilson.co.uk

Commit Message

Chris Wilson June 3, 2016, 4:08 p.m. UTC
If we flag the seqno as potentially stale upon receiving an interrupt,
we can use that information to reduce the frequency with which we apply the
heavyweight coherent seqno read (i.e. if we wake up a chain of waiters).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h          | 15 ++++++++++++++-
 drivers/gpu/drm/i915/i915_irq.c          |  1 +
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 16 ++++++++++------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  1 +
 4 files changed, 26 insertions(+), 7 deletions(-)
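
For orientation before the diff: a minimal sketch of the pattern this patch
introduces, an interrupt-side producer setting a per-engine flag and a
waiter-side consumer clearing it before the heavyweight read. This is
simplified pseudo-driver code, not the i915 sources; it only mirrors the
names used in the patch below.

	/* interrupt handler: flag that a new seqno may now be visible */
	static void notify_ring(struct intel_engine_cs *engine)
	{
		smp_store_mb(engine->irq_posted, true); /* store + full barrier */
		intel_engine_wakeup(engine);            /* kick the first waiter */
	}

	/* waiter: pay for the heavyweight barrier only if an irq fired */
	static bool seqno_complete(struct drm_i915_gem_request *req)
	{
		struct intel_engine_cs *engine = req->engine;

		if (engine->irq_seqno_barrier && READ_ONCE(engine->irq_posted)) {
			WRITE_ONCE(engine->irq_posted, false); /* clear, then barrier */
			engine->irq_seqno_barrier(engine);     /* coherent seqno read */
		}
		return i915_gem_request_completed(req);
	}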

Comments

Tvrtko Ursulin June 6, 2016, 3:34 p.m. UTC | #1
On 03/06/16 17:08, Chris Wilson wrote:
> If we flag the seqno as potentially stale upon receiving an interrupt,
> we can use that information to reduce the frequency with which we apply the
> heavyweight coherent seqno read (i.e. if we wake up a chain of waiters).
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.h          | 15 ++++++++++++++-
>   drivers/gpu/drm/i915/i915_irq.c          |  1 +
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 16 ++++++++++------
>   drivers/gpu/drm/i915/intel_ringbuffer.h  |  1 +
>   4 files changed, 26 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4ddb9ff319cb..a71d08199d57 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3935,7 +3935,20 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>   	 * but it is easier and safer to do it every time the waiter
>   	 * is woken.
>   	 */
> -	if (engine->irq_seqno_barrier) {
> +	if (engine->irq_seqno_barrier && READ_ONCE(engine->irq_posted)) {
> +		/* The ordering of irq_posted versus applying the barrier
> +		 * is crucial. The clearing of the current irq_posted must
> +		 * be visible before we perform the barrier operation,
> +		 * such that if a subsequent interrupt arrives, irq_posted
> +		 * is reasserted and our task rewoken (which causes us to
> +		 * do another __i915_request_irq_complete() immediately
> +		 * and reapply the barrier). Conversely, if the clear
> +		 * occurs after the barrier, then an interrupt that arrived
> +		 * whilst we waited on the barrier would not trigger a
> +		 * barrier on the next pass, and the read may not see the
> +		 * seqno update.
> +		 */
> +		WRITE_ONCE(engine->irq_posted, false);

Why is this not smp_store_mb ?

>   		engine->irq_seqno_barrier(engine);
>   		if (i915_gem_request_completed(req))
>   			return true;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c14eb57b5807..14b3d65bb604 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -976,6 +976,7 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
>
>   static void notify_ring(struct intel_engine_cs *engine)
>   {
> +	smp_store_mb(engine->irq_posted, true);
>   	if (intel_engine_wakeup(engine)) {
>   		trace_i915_gem_request_notify(engine);
>   		engine->user_interrupts++;
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> index 44346de39794..0f5fe114c204 100644
> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -43,12 +43,18 @@ static void intel_breadcrumbs_fake_irq(unsigned long data)
>
>   static void irq_enable(struct intel_engine_cs *engine)
>   {
> +	/* Enabling the IRQ may miss the generation of the interrupt, but
> +	 * we still need to force the barrier before reading the seqno,
> +	 * just in case.
> +	 */
> +	engine->irq_posted = true;

Should it be smp_store_mb here as well?

>   	WARN_ON(!engine->irq_get(engine));
>   }
>
>   static void irq_disable(struct intel_engine_cs *engine)
>   {
>   	engine->irq_put(engine);
> +	engine->irq_posted = false;
>   }
>
>   static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
> @@ -56,7 +62,6 @@ static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
>   	struct intel_engine_cs *engine =
>   		container_of(b, struct intel_engine_cs, breadcrumbs);
>   	struct drm_i915_private *i915 = engine->i915;
> -	bool irq_posted = false;
>
>   	assert_spin_locked(&b->lock);
>   	if (b->rpm_wakelock)
> @@ -72,10 +77,8 @@ static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
>
>   	/* No interrupts? Kick the waiter every jiffie! */
>   	if (intel_irqs_enabled(i915)) {
> -		if (!test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
> +		if (!test_bit(engine->id, &i915->gpu_error.test_irq_rings))
>   			irq_enable(engine);
> -			irq_posted = true;
> -		}
>   		b->irq_enabled = true;
>   	}
>
> @@ -83,7 +86,7 @@ static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
>   	    test_bit(engine->id, &i915->gpu_error.missed_irq_rings))
>   		mod_timer(&b->fake_irq, jiffies + 1);
>
> -	return irq_posted;
> +	return READ_ONCE(engine->irq_posted);
>   }
>
>   static void __intel_breadcrumbs_disable_irq(struct intel_breadcrumbs *b)
> @@ -197,7 +200,8 @@ bool intel_engine_add_wait(struct intel_engine_cs *engine,
>   			 * in case the seqno passed.
>   			 */
>   			__intel_breadcrumbs_enable_irq(b);
> -			wake_up_process(to_wait(next)->task);
> +			if (READ_ONCE(engine->irq_posted))

if (__intel_breadcrumbs_enable_irq(b)) ?

> +				wake_up_process(to_wait(next)->task);
>   		}
>
>   		do {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index cb599a54931a..324f85e8d540 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -197,6 +197,7 @@ struct intel_engine_cs {
>   	struct i915_ctx_workarounds wa_ctx;
>
>   	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
> +	bool		irq_posted;
>   	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
>   	struct drm_i915_gem_request *trace_irq_req;
>   	bool __must_check (*irq_get)(struct intel_engine_cs *ring);
>

Regards,

Tvrtko
Chris Wilson June 8, 2016, 9:35 a.m. UTC | #2
On Mon, Jun 06, 2016 at 04:34:27PM +0100, Tvrtko Ursulin wrote:
> 
> On 03/06/16 17:08, Chris Wilson wrote:
> >If we flag the seqno as potentially stale upon receiving an interrupt,
> >we can use that information to reduce the frequency with which we apply the
> >heavyweight coherent seqno read (i.e. if we wake up a chain of waiters).
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_drv.h          | 15 ++++++++++++++-
> >  drivers/gpu/drm/i915/i915_irq.c          |  1 +
> >  drivers/gpu/drm/i915/intel_breadcrumbs.c | 16 ++++++++++------
> >  drivers/gpu/drm/i915/intel_ringbuffer.h  |  1 +
> >  4 files changed, 26 insertions(+), 7 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 4ddb9ff319cb..a71d08199d57 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -3935,7 +3935,20 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
> >  	 * but it is easier and safer to do it every time the waiter
> >  	 * is woken.
> >  	 */
> >-	if (engine->irq_seqno_barrier) {
> >+	if (engine->irq_seqno_barrier && READ_ONCE(engine->irq_posted)) {
> >+		/* The ordering of irq_posted versus applying the barrier
> >+		 * is crucial. The clearing of the current irq_posted must
> >+		 * be visible before we perform the barrier operation,
> >+		 * such that if a subsequent interrupt arrives, irq_posted
> >+		 * is reasserted and our task rewoken (which causes us to
> >+		 * do another __i915_request_irq_complete() immediately
> >+		 * and reapply the barrier). Conversely, if the clear
> >+		 * occurs after the barrier, then an interrupt that arrived
> >+		 * whilst we waited on the barrier would not trigger a
> >+		 * barrier on the next pass, and the read may not see the
> >+		 * seqno update.
> >+		 */
> >+		WRITE_ONCE(engine->irq_posted, false);
> 
> Why is this not smp_store_mb ?

We only require the ordering wrt irq_seqno_barrier().

How about:

if (engine->irq_seqno_barrier &&
    cmpxchg_relaxed(&engine->irq_posted, 1, 0)) {

Less shouty?
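
For reference, cmpxchg() returns the old value, so the suggestion above acts
as an atomic test-and-clear: the barrier path is entered only when an
interrupt has been posted since the last check. A sketch (assuming irq_posted
is a type cmpxchg() accepts; the final code may differ):

	if (engine->irq_seqno_barrier &&
	    cmpxchg_relaxed(&engine->irq_posted, 1, 0)) {
		engine->irq_seqno_barrier(engine);
		if (i915_gem_request_completed(req))
			return true;
	}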

> >diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> >index 44346de39794..0f5fe114c204 100644
> >--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
> >+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> >@@ -43,12 +43,18 @@ static void intel_breadcrumbs_fake_irq(unsigned long data)
> >
> >  static void irq_enable(struct intel_engine_cs *engine)
> >  {
> >+	/* Enabling the IRQ may miss the generation of the interrupt, but
> >+	 * we still need to force the barrier before reading the seqno,
> >+	 * just in case.
> >+	 */
> >+	engine->irq_posted = true;
> 
> Should it be smp_store_mb here as well?

No, this is written/read on the same callchain.
-Chris
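
In other words, the store in irq_enable() and the READ_ONCE() in
intel_engine_add_wait() run on the same thread, under b->lock, so program
order alone orders them. Roughly (a sketch of the callchain, not verbatim
code):

	spin_lock(&b->lock);
	__intel_breadcrumbs_enable_irq(b);  /* irq_enable(): irq_posted = true */
	if (READ_ONCE(engine->irq_posted))  /* same thread reads its own store */
		wake_up_process(to_wait(next)->task);
	spin_unlock(&b->lock);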
Tvrtko Ursulin June 8, 2016, 9:57 a.m. UTC | #3
On 08/06/16 10:35, Chris Wilson wrote:
> On Mon, Jun 06, 2016 at 04:34:27PM +0100, Tvrtko Ursulin wrote:
>>
>> On 03/06/16 17:08, Chris Wilson wrote:
>>> If we flag the seqno as potentially stale upon receiving an interrupt,
>>> we can use that information to reduce the frequency with which we apply the
>>> heavyweight coherent seqno read (i.e. if we wake up a chain of waiters).
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>   drivers/gpu/drm/i915/i915_drv.h          | 15 ++++++++++++++-
>>>   drivers/gpu/drm/i915/i915_irq.c          |  1 +
>>>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 16 ++++++++++------
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h  |  1 +
>>>   4 files changed, 26 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index 4ddb9ff319cb..a71d08199d57 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -3935,7 +3935,20 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
>>>   	 * but it is easier and safer to do it every time the waiter
>>>   	 * is woken.
>>>   	 */
>>> -	if (engine->irq_seqno_barrier) {
>>> +	if (engine->irq_seqno_barrier && READ_ONCE(engine->irq_posted)) {
>>> +		/* The ordering of irq_posted versus applying the barrier
>>> +		 * is crucial. The clearing of the current irq_posted must
>>> +		 * be visible before we perform the barrier operation,
>>> +		 * such that if a subsequent interrupt arrives, irq_posted
>>> +		 * is reasserted and our task rewoken (which causes us to
>>> +		 * do another __i915_request_irq_complete() immediately
>>> +		 * and reapply the barrier). Conversely, if the clear
>>> +		 * occurs after the barrier, then an interrupt that arrived
>>> +		 * whilst we waited on the barrier would not trigger a
>>> +		 * barrier on the next pass, and the read may not see the
>>> +		 * seqno update.
>>> +		 */
>>> +		WRITE_ONCE(engine->irq_posted, false);
>>
>> Why is this not smp_store_mb ?
>
> We only require the ordering wrt irq_seqno_barrier().
>
> How about:
>
> if (engine->irq_seqno_barrier &&
>      cmpxchg_relaxed(&engine->irq_posted, 1, 0)) {
>
> Less shouty?

I think so.

>>> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
>>> index 44346de39794..0f5fe114c204 100644
>>> --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
>>> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
>>> @@ -43,12 +43,18 @@ static void intel_breadcrumbs_fake_irq(unsigned long data)
>>>
>>>   static void irq_enable(struct intel_engine_cs *engine)
>>>   {
>>> +	/* Enabling the IRQ may miss the generation of the interrupt, but
>>> +	 * we still need to force the barrier before reading the seqno,
>>> +	 * just in case.
>>> +	 */
>>> +	engine->irq_posted = true;
>>
>> Should it be smp_store_mb here as well?
>
> No, this is written/read on the same callchain.

Ah true.

Regards,

Tvrtko

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4ddb9ff319cb..a71d08199d57 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3935,7 +3935,20 @@ static inline bool __i915_request_irq_complete(struct drm_i915_gem_request *req)
 	 * but it is easier and safer to do it every time the waiter
 	 * is woken.
 	 */
-	if (engine->irq_seqno_barrier) {
+	if (engine->irq_seqno_barrier && READ_ONCE(engine->irq_posted)) {
+		/* The ordering of irq_posted versus applying the barrier
+		 * is crucial. The clearing of the current irq_posted must
+		 * be visible before we perform the barrier operation,
+		 * such that if a subsequent interrupt arrives, irq_posted
+		 * is reasserted and our task rewoken (which causes us to
+		 * do another __i915_request_irq_complete() immediately
+		 * and reapply the barrier). Conversely, if the clear
+		 * occurs after the barrier, then an interrupt that arrived
+		 * whilst we waited on the barrier would not trigger a
+		 * barrier on the next pass, and the read may not see the
+		 * seqno update.
+		 */
+		WRITE_ONCE(engine->irq_posted, false);
 		engine->irq_seqno_barrier(engine);
 		if (i915_gem_request_completed(req))
 			return true;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index c14eb57b5807..14b3d65bb604 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -976,6 +976,7 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
 
 static void notify_ring(struct intel_engine_cs *engine)
 {
+	smp_store_mb(engine->irq_posted, true);
 	if (intel_engine_wakeup(engine)) {
 		trace_i915_gem_request_notify(engine);
 		engine->user_interrupts++;
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 44346de39794..0f5fe114c204 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -43,12 +43,18 @@ static void intel_breadcrumbs_fake_irq(unsigned long data)
 
 static void irq_enable(struct intel_engine_cs *engine)
 {
+	/* Enabling the IRQ may miss the generation of the interrupt, but
+	 * we still need to force the barrier before reading the seqno,
+	 * just in case.
+	 */
+	engine->irq_posted = true;
 	WARN_ON(!engine->irq_get(engine));
 }
 
 static void irq_disable(struct intel_engine_cs *engine)
 {
 	engine->irq_put(engine);
+	engine->irq_posted = false;
 }
 
 static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
@@ -56,7 +62,6 @@ static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
 	struct intel_engine_cs *engine =
 		container_of(b, struct intel_engine_cs, breadcrumbs);
 	struct drm_i915_private *i915 = engine->i915;
-	bool irq_posted = false;
 
 	assert_spin_locked(&b->lock);
 	if (b->rpm_wakelock)
@@ -72,10 +77,8 @@ static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
 
 	/* No interrupts? Kick the waiter every jiffie! */
 	if (intel_irqs_enabled(i915)) {
-		if (!test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
+		if (!test_bit(engine->id, &i915->gpu_error.test_irq_rings))
 			irq_enable(engine);
-			irq_posted = true;
-		}
 		b->irq_enabled = true;
 	}
 
@@ -83,7 +86,7 @@ static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
 	    test_bit(engine->id, &i915->gpu_error.missed_irq_rings))
 		mod_timer(&b->fake_irq, jiffies + 1);
 
-	return irq_posted;
+	return READ_ONCE(engine->irq_posted);
 }
 
 static void __intel_breadcrumbs_disable_irq(struct intel_breadcrumbs *b)
@@ -197,7 +200,8 @@ bool intel_engine_add_wait(struct intel_engine_cs *engine,
 			 * in case the seqno passed.
 			 */
 			__intel_breadcrumbs_enable_irq(b);
-			wake_up_process(to_wait(next)->task);
+			if (READ_ONCE(engine->irq_posted))
+				wake_up_process(to_wait(next)->task);
 		}
 
 		do {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index cb599a54931a..324f85e8d540 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -197,6 +197,7 @@ struct intel_engine_cs {
 	struct i915_ctx_workarounds wa_ctx;
 
 	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
+	bool		irq_posted;
 	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
 	struct drm_i915_gem_request *trace_irq_req;
 	bool __must_check (*irq_get)(struct intel_engine_cs *ring);