
[13/13] drm/i915: Cache last IRQ seqno to reduce IRQ overhead

Message ID 1449839521-21958-14-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived

Commit Message

John Harrison Dec. 11, 2015, 1:12 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

The notify function can be called many times without the seqno
changing. A large number of the duplicates are there to prevent races
due to the requirement of not enabling interrupts until requested.
However, when interrupts are enabled the IRQ handler can be called
multiple times without the ring's seqno value changing. This patch
reduces the overhead of these extra calls by caching the last
processed seqno value and exiting early if it has not changed.

v3: New patch for series.

For: VIZ-5190
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         | 14 +++++++++++---
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

Comments

Tvrtko Ursulin Dec. 11, 2015, 2:28 p.m. UTC | #1
On 11/12/15 13:12, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> The notify function can be called many times without the seqno
> changing. A large number of duplicates are to prevent races due to the
> requirement of not enabling interrupts until requested. However, when
> interrupts are enabled the IRQ handler can be called multiple times
> without the ring's seqno value changing. This patch reduces the
> overhead of these extra calls by caching the last processed seqno
> value and early exiting if it has not changed.
>
> v3: New patch for series.
>
> For: VIZ-5190
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c         | 14 +++++++++++---
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>   2 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 279d79f..3c88678 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2457,6 +2457,8 @@ i915_gem_init_seqno(struct drm_device *dev, u32 seqno)
>
>   		for (j = 0; j < ARRAY_SIZE(ring->semaphore.sync_seqno); j++)
>   			ring->semaphore.sync_seqno[j] = 0;
> +
> +		ring->last_irq_seqno = 0;
>   	}
>
>   	return 0;
> @@ -2788,11 +2790,14 @@ void i915_gem_request_notify(struct intel_engine_cs *ring, bool fence_locked)
>   		return;
>   	}
>
> -	if (!fence_locked)
> -		spin_lock_irqsave(&ring->fence_lock, flags);
> -
>   	seqno = ring->get_seqno(ring, false);
>   	trace_i915_gem_request_notify(ring, seqno);
> +	if (seqno == ring->last_irq_seqno)
> +		return;
> +	ring->last_irq_seqno = seqno;

Hmmm.. do you want to make the check "seqno <= ring->last_irq_seqno" ?

Is there a possibility for some weird timing or caching issue where two 
callers get in and last_irq_seqno goes backwards? Not sure that it would 
cause a problem, but the pattern is unusual and hard for me to understand.

Also, the check and the assignment would need to be under the spinlock, I think.
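
If the concern is the counter appearing to go backwards, one option would 
be the wrap-safe signed-difference idiom already used by 
i915_seqno_passed(). Purely as an illustration of that suggestion 
(seqno_advanced() is a made-up helper name, not existing code):

	/*
	 * Illustrative sketch only: a wrap-safe "has the seqno advanced?"
	 * test, using the same signed-difference idiom as
	 * i915_seqno_passed(), which could replace the straight equality
	 * check if going backwards were a worry.
	 */
	static inline bool seqno_advanced(u32 seqno, u32 last)
	{
		return (s32)(seqno - last) > 0;
	}

	/* The notify fast path would then read:
	 *
	 *	if (!seqno_advanced(seqno, ring->last_irq_seqno))
	 *		return;
	 *	ring->last_irq_seqno = seqno;
	 */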

> +
> +	if (!fence_locked)
> +		spin_lock_irqsave(&ring->fence_lock, flags);
>
>   	list_for_each_entry_safe(req, req_next, &ring->fence_signal_list, signal_link) {
>   		if (!req->cancelled) {
> @@ -3163,7 +3168,10 @@ static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv,
>   	 * Tidy up anything left over. This includes a call to
>   	 * i915_gem_request_notify() which will make sure that any requests
>   	 * that were on the signal pending list get also cleaned up.
> +	 * NB: The seqno cache must be cleared otherwise the notify call will
> +	 * simply return immediately.
>   	 */
> +	ring->last_irq_seqno = 0;
>   	i915_gem_retire_requests_ring(ring);
>
>   	/* Having flushed all requests from all queues, we know that all
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 9d09edb..1987abd 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -356,6 +356,7 @@ struct  intel_engine_cs {
>   	spinlock_t fence_lock;
>   	struct list_head fence_signal_list;
>   	struct list_head fence_unsignal_list;
> +	uint32_t last_irq_seqno;
>   };
>
>   bool intel_ring_initialized(struct intel_engine_cs *ring);
>

Regards,

Tvrtko
Chris Wilson Dec. 11, 2015, 2:55 p.m. UTC | #2
On Fri, Dec 11, 2015 at 01:12:01PM +0000, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> The notify function can be called many times without the seqno
> changing. A large number of duplicates are to prevent races due to the
> requirement of not enabling interrupts until requested. However, when
> interrupts are enabled the IRQ handler can be called multiple times
> without the ring's seqno value changing. This patch reduces the
> overhead of these extra calls by caching the last processed seqno
> value and early exiting if it has not changed.

This is just plain wrong. Every user-interrupt is preceded by a seqno
update.
-Chris
John Harrison Dec. 11, 2015, 3:35 p.m. UTC | #3
On 11/12/2015 14:55, Chris Wilson wrote:
> On Fri, Dec 11, 2015 at 01:12:01PM +0000, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> The notify function can be called many times without the seqno
>> changing. A large number of duplicates are to prevent races due to the
>> requirement of not enabling interrupts until requested. However, when
>> interrupts are enabled the IRQ handler can be called multiple times
>> without the ring's seqno value changing. This patch reduces the
>> overhead of these extra calls by caching the last processed seqno
>> value and early exiting if it has not changed.
> This is just plain wrong. Every user-interrupt is preceded by a seqno
> update.
Except that multiple interrupts can be coalesced if they occur too close 
together. The driver's IRQ handler still gets called for each individual 
interrupt, but the first time it runs it already sees the seqno of the 
last one. Thus all the processing gets done on the first invocation. The 
multiple subsequent invocations (I have seen up to four, I believe) then 
have nothing to do.

> -Chris
>
Chris Wilson Dec. 11, 2015, 4:07 p.m. UTC | #4
On Fri, Dec 11, 2015 at 03:35:54PM +0000, John Harrison wrote:
> On 11/12/2015 14:55, Chris Wilson wrote:
> >On Fri, Dec 11, 2015 at 01:12:01PM +0000, John.C.Harrison@Intel.com wrote:
> >>From: John Harrison <John.C.Harrison@Intel.com>
> >>
> >>The notify function can be called many times without the seqno
> >>changing. A large number of duplicates are to prevent races due to the
> >>requirement of not enabling interrupts until requested. However, when
> >>interrupts are enabled the IRQ handler can be called multiple times
> >>without the ring's seqno value changing. This patch reduces the
> >>overhead of these extra calls by caching the last processed seqno
> >>value and early exiting if it has not changed.
> >This is just plain wrong. Every user-interrupt is preceded by a seqno
> >update.
> Except that multiple interrupts can be coalesced if they occur too
> close together. The driver's IRQ handler still gets called for each
> individual interrupt but the first time it is run it sees the seqno
> for the last. Thus all the processing gets done on the first
> invocation. The multiple subsequent invocations (I have seen up to
> four I believe) then have nothing to do.

Yes. That is not what you implied above, nor by talking about caching the
seqno -- which is already cached. There is a reason why we don't do this
in the interrupt handler, and we are not about to do so again.
-Chris
John Harrison Dec. 14, 2015, 11:58 a.m. UTC | #5
On 11/12/2015 14:28, Tvrtko Ursulin wrote:
> On 11/12/15 13:12, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> The notify function can be called many times without the seqno
>> changing. A large number of duplicates are to prevent races due to the
>> requirement of not enabling interrupts until requested. However, when
>> interrupts are enabled the IRQ handler can be called multiple times
>> without the ring's seqno value changing. This patch reduces the
>> overhead of these extra calls by caching the last processed seqno
>> value and early exiting if it has not changed.
>>
>> v3: New patch for series.
>>
>> For: VIZ-5190
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gem.c         | 14 +++++++++++---
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>>   2 files changed, 12 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c 
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 279d79f..3c88678 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2457,6 +2457,8 @@ i915_gem_init_seqno(struct drm_device *dev, u32 
>> seqno)
>>
>>           for (j = 0; j < ARRAY_SIZE(ring->semaphore.sync_seqno); j++)
>>               ring->semaphore.sync_seqno[j] = 0;
>> +
>> +        ring->last_irq_seqno = 0;
>>       }
>>
>>       return 0;
>> @@ -2788,11 +2790,14 @@ void i915_gem_request_notify(struct 
>> intel_engine_cs *ring, bool fence_locked)
>>           return;
>>       }
>>
>> -    if (!fence_locked)
>> -        spin_lock_irqsave(&ring->fence_lock, flags);
>> -
>>       seqno = ring->get_seqno(ring, false);
>>       trace_i915_gem_request_notify(ring, seqno);
>> +    if (seqno == ring->last_irq_seqno)
>> +        return;
>> +    ring->last_irq_seqno = seqno;
>
> Hmmm.. do you want to make the check "seqno <= ring->last_irq_seqno" ?
>
> Is there a possibility for some weird timing or caching issue where 
> two callers get in and last_irq_seqno goes backwards? Not sure that it 
> would cause a problem, but pattern is unusual and hard to understand 
> for me.
The check is simply there to prevent repeated processing of identical 
seqno values. The 'last_' value is never used for anything more 
complicated. If there is a very rare race condition in which the repeated 
processing can still happen, it doesn't really matter too much.

> Also check and the assignment would need to be under the spinlock I 
> think.

The whole point is to not grab the spinlock if there is no work to do. 
Hence the seqno read and test must be done first. The assignment could 
potentially be done after taking the lock, but if two different threads 
have made it that far concurrently then it doesn't really matter which 
one does the write first. Most likely they are both processing the same 
seqno. In the really rare case of two concurrent threads actually reading 
two different (and both new) seqno values, there is no guarantee about 
which will take the lock first anyway. So you are back in the situation 
above: it doesn't really matter if a later call finds an 'incorrect' last 
value and goes through the processing sequence with no work to do.


>> +
>> +    if (!fence_locked)
>> +        spin_lock_irqsave(&ring->fence_lock, flags);
>>
>>       list_for_each_entry_safe(req, req_next, 
>> &ring->fence_signal_list, signal_link) {
>>           if (!req->cancelled) {
>> @@ -3163,7 +3168,10 @@ static void i915_gem_reset_ring_cleanup(struct 
>> drm_i915_private *dev_priv,
>>        * Tidy up anything left over. This includes a call to
>>        * i915_gem_request_notify() which will make sure that any 
>> requests
>>        * that were on the signal pending list get also cleaned up.
>> +     * NB: The seqno cache must be cleared otherwise the notify call 
>> will
>> +     * simply return immediately.
>>        */
>> +    ring->last_irq_seqno = 0;
>>       i915_gem_retire_requests_ring(ring);
>>
>>       /* Having flushed all requests from all queues, we know that all
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
>> b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 9d09edb..1987abd 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -356,6 +356,7 @@ struct  intel_engine_cs {
>>       spinlock_t fence_lock;
>>       struct list_head fence_signal_list;
>>       struct list_head fence_unsignal_list;
>> +    uint32_t last_irq_seqno;
>>   };
>>
>>   bool intel_ring_initialized(struct intel_engine_cs *ring);
>>
>
> Regards,
>
> Tvrtko
Tvrtko Ursulin Dec. 14, 2015, 12:52 p.m. UTC | #6
On 14/12/15 11:58, John Harrison wrote:
> On 11/12/2015 14:28, Tvrtko Ursulin wrote:
>> On 11/12/15 13:12, John.C.Harrison@Intel.com wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> The notify function can be called many times without the seqno
>>> changing. A large number of duplicates are to prevent races due to the
>>> requirement of not enabling interrupts until requested. However, when
>>> interrupts are enabled the IRQ handler can be called multiple times
>>> without the ring's seqno value changing. This patch reduces the
>>> overhead of these extra calls by caching the last processed seqno
>>> value and early exiting if it has not changed.
>>>
>>> v3: New patch for series.
>>>
>>> For: VIZ-5190
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gem.c         | 14 +++++++++++---
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  1 +
>>>   2 files changed, 12 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
>>> b/drivers/gpu/drm/i915/i915_gem.c
>>> index 279d79f..3c88678 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -2457,6 +2457,8 @@ i915_gem_init_seqno(struct drm_device *dev, u32
>>> seqno)
>>>
>>>           for (j = 0; j < ARRAY_SIZE(ring->semaphore.sync_seqno); j++)
>>>               ring->semaphore.sync_seqno[j] = 0;
>>> +
>>> +        ring->last_irq_seqno = 0;
>>>       }
>>>
>>>       return 0;
>>> @@ -2788,11 +2790,14 @@ void i915_gem_request_notify(struct
>>> intel_engine_cs *ring, bool fence_locked)
>>>           return;
>>>       }
>>>
>>> -    if (!fence_locked)
>>> -        spin_lock_irqsave(&ring->fence_lock, flags);
>>> -
>>>       seqno = ring->get_seqno(ring, false);
>>>       trace_i915_gem_request_notify(ring, seqno);
>>> +    if (seqno == ring->last_irq_seqno)
>>> +        return;
>>> +    ring->last_irq_seqno = seqno;
>>
>> Hmmm.. do you want to make the check "seqno <= ring->last_irq_seqno" ?
>>
>> Is there a possibility for some weird timing or caching issue where
>> two callers get in and last_irq_seqno goes backwards? Not sure that it
>> would cause a problem, but pattern is unusual and hard to understand
>> for me.
> The check is simply to prevent repeat processing of identical seqno
> values. The 'last_' value is never used for anything more complicated.
> If there is a very rare race condition where the repeat processing can
> still happen, it doesn't really matter too much.
>
>> Also check and the assignment would need to be under the spinlock I
>> think.
>
> The whole point is to not grab the spinlock if there is no work to do.
> Hence the seqno read and test must be done first. The assignment could
> potentially be done after the lock but if two different threads have
> made it that far concurrently then it doesn't really matter who does the
> write first. Most likely they are both processing the same seqno and in
> the really rare case of two concurrent threads actually reading two
> different (and both new) seqno values then there is no guarantee about
> which will take the lock first. So you are into the above situation of
> it doesn't really matter if there is then a third time around later that
> finds an 'incorrect' last value and goes through the processing sequence
> but with no work to do.

I think it would be good to put that in a code comment then. :)

That you don't care about multiple notify calls processing the same seqno 
if the timing is right, or that you don't care if ring->last_irq_seqno 
does not reflect the last processed seqno, etc.
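
Such a comment could look roughly like this above the check (the wording 
is only a suggestion, not part of the patch):

	/*
	 * Lockless check: last_irq_seqno is only a best-effort cache used
	 * to skip redundant signal-list processing. Concurrent callers may
	 * race on this read and write; the worst case is an extra pass
	 * through the list with nothing to do, which is harmless.
	 */
	if (seqno == ring->last_irq_seqno)
		return;
	ring->last_irq_seqno = seqno;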

Regards,

Tvrtko

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 279d79f..3c88678 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2457,6 +2457,8 @@  i915_gem_init_seqno(struct drm_device *dev, u32 seqno)
 
 		for (j = 0; j < ARRAY_SIZE(ring->semaphore.sync_seqno); j++)
 			ring->semaphore.sync_seqno[j] = 0;
+
+		ring->last_irq_seqno = 0;
 	}
 
 	return 0;
@@ -2788,11 +2790,14 @@  void i915_gem_request_notify(struct intel_engine_cs *ring, bool fence_locked)
 		return;
 	}
 
-	if (!fence_locked)
-		spin_lock_irqsave(&ring->fence_lock, flags);
-
 	seqno = ring->get_seqno(ring, false);
 	trace_i915_gem_request_notify(ring, seqno);
+	if (seqno == ring->last_irq_seqno)
+		return;
+	ring->last_irq_seqno = seqno;
+
+	if (!fence_locked)
+		spin_lock_irqsave(&ring->fence_lock, flags);
 
 	list_for_each_entry_safe(req, req_next, &ring->fence_signal_list, signal_link) {
 		if (!req->cancelled) {
@@ -3163,7 +3168,10 @@  static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv,
 	 * Tidy up anything left over. This includes a call to
 	 * i915_gem_request_notify() which will make sure that any requests
 	 * that were on the signal pending list get also cleaned up.
+	 * NB: The seqno cache must be cleared otherwise the notify call will
+	 * simply return immediately.
 	 */
+	ring->last_irq_seqno = 0;
 	i915_gem_retire_requests_ring(ring);
 
 	/* Having flushed all requests from all queues, we know that all
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 9d09edb..1987abd 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -356,6 +356,7 @@  struct  intel_engine_cs {
 	spinlock_t fence_lock;
 	struct list_head fence_signal_list;
 	struct list_head fence_unsignal_list;
+	uint32_t last_irq_seqno;
 };
 
 bool intel_ring_initialized(struct intel_engine_cs *ring);