diff mbox

[03/15] drm/i915: Only spin whilst waiting on the current request

Message ID 1448786893-2522-4-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Nov. 29, 2015, 8:48 a.m. UTC
Limit busywaiting only to the request currently being processed by the
GPU. If the request is not currently being processed by the GPU, there
is a very low likelihood of it being completed within the 2 microsecond
spin timeout and so we will just be wasting CPU cycles.

v2: Check for logical inversion when rebasing - we were incorrectly
checking for this request being active, and instead busywaiting for
when the GPU was not yet processing the request of interest.

v3: Try another colour for the seqno names.
v4: Another colour for the function names.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
Cc: stable@kernel.vger.org
---
 drivers/gpu/drm/i915/i915_drv.h | 27 +++++++++++++++++++--------
 drivers/gpu/drm/i915/i915_gem.c |  8 +++++++-
 2 files changed, 26 insertions(+), 9 deletions(-)

Comments

Tvrtko Ursulin Nov. 30, 2015, 10:06 a.m. UTC | #1
On 29/11/15 08:48, Chris Wilson wrote:
> Limit busywaiting only to the request currently being processed by the
> GPU. If the request is not currently being processed by the GPU, there
> is a very low likelihood of it being completed within the 2 microsecond
> spin timeout and so we will just be wasting CPU cycles.
>
> v2: Check for logical inversion when rebasing - we were incorrectly
> checking for this request being active, and instead busywaiting for
> when the GPU was not yet processing the request of interest.
>
> v3: Try another colour for the seqno names.
> v4: Another colour for the function names.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> Cc: stable@kernel.vger.org
> ---
>   drivers/gpu/drm/i915/i915_drv.h | 27 +++++++++++++++++++--------
>   drivers/gpu/drm/i915/i915_gem.c |  8 +++++++-
>   2 files changed, 26 insertions(+), 9 deletions(-)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index bc865e234df2..d07041c1729d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2182,8 +2182,17 @@ struct drm_i915_gem_request {
>   	struct drm_i915_private *i915;
>   	struct intel_engine_cs *ring;
>
> -	/** GEM sequence number associated with this request. */
> -	uint32_t seqno;
> +	 /** GEM sequence number associated with the previous request,
> +	  * when the HWS breadcrumb is equal to this the GPU is processing
> +	  * this request.
> +	  */
> +	u32 previous_seqno;
> +
> +	 /** GEM sequence number associated with this request,
> +	  * when the HWS breadcrumb is equal or greater than this the GPU
> +	  * has finished processing this request.
> +	  */
> +	u32 seqno;
>
>   	/** Position in the ringbuffer of the start of the request */
>   	u32 head;
> @@ -2945,15 +2954,17 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2)
>   	return (int32_t)(seq1 - seq2) >= 0;
>   }
>
> +static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
> +					   bool lazy_coherency)
> +{
> +	u32 seqno = req->ring->get_seqno(req->ring, lazy_coherency);
> +	return i915_seqno_passed(seqno, req->previous_seqno);
> +}
> +
>   static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>   					      bool lazy_coherency)
>   {
> -	u32 seqno;
> -
> -	BUG_ON(req == NULL);
> -
> -	seqno = req->ring->get_seqno(req->ring, lazy_coherency);
> -
> +	u32 seqno = req->ring->get_seqno(req->ring, lazy_coherency);
>   	return i915_seqno_passed(seqno, req->seqno);
>   }
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index bad112abb16b..ada461e02718 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1193,9 +1193,13 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   	 * takes to sleep on a request, on the order of a microsecond.
>   	 */
>
> -	if (i915_gem_request_get_ring(req)->irq_refcount)
> +	if (req->ring->irq_refcount)
>   		return -EBUSY;
>
> +	/* Only spin if we know the GPU is processing this request */
> +	if (!i915_gem_request_started(req, false))
> +		return -EAGAIN;
> +
>   	timeout = local_clock_us(&cpu) + 10;
>   	while (!need_resched()) {
>   		if (i915_gem_request_completed(req, true))
> @@ -1209,6 +1213,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>
>   		cpu_relax_lowlatency();
>   	}
> +
>   	if (i915_gem_request_completed(req, false))
>   		return 0;
>
> @@ -2592,6 +2597,7 @@ void __i915_add_request(struct drm_i915_gem_request *request,
>   	request->batch_obj = obj;
>
>   	request->emitted_jiffies = jiffies;
> +	request->previous_seqno = ring->last_submitted_seqno;
>   	ring->last_submitted_seqno = request->seqno;
>   	list_add_tail(&request->list, &ring->request_list);
>
>
Dave Gordon Dec. 1, 2015, 3:47 p.m. UTC | #2
On 30/11/15 10:06, Tvrtko Ursulin wrote:
>
> On 29/11/15 08:48, Chris Wilson wrote:
>> Limit busywaiting only to the request currently being processed by the
>> GPU. If the request is not currently being processed by the GPU, there
>> is a very low likelihood of it being completed within the 2 microsecond
>> spin timeout and so we will just be wasting CPU cycles.
>>
>> v2: Check for logical inversion when rebasing - we were incorrectly
>> checking for this request being active, and instead busywaiting for
>> when the GPU was not yet processing the request of interest.
>>
>> v3: Try another colour for the seqno names.
>> v4: Another colour for the function names.

Adding a field in the request to track the sequence number of the 
previous request isn't ideal when considering the scheduler and 
preemption. But we've got a separate batch-in-progress sequence number 
in the hardware status page (also for use by TDR to check which batch is 
currently running), so you could use that. Then the check is simply

     curr = intel_read_status_page(ring, I915_BATCH_ACTIVE_SEQNO);
     if (curr != req->seqno)
         return -EAGAIN;

.DAve.
Chris Wilson Dec. 1, 2015, 3:58 p.m. UTC | #3
On Tue, Dec 01, 2015 at 03:47:34PM +0000, Dave Gordon wrote:
> On 30/11/15 10:06, Tvrtko Ursulin wrote:
> >
> >On 29/11/15 08:48, Chris Wilson wrote:
> >>Limit busywaiting only to the request currently being processed by the
> >>GPU. If the request is not currently being processed by the GPU, there
> >>is a very low likelihood of it being completed within the 2 microsecond
> >>spin timeout and so we will just be wasting CPU cycles.
> >>
> >>v2: Check for logical inversion when rebasing - we were incorrectly
> >>checking for this request being active, and instead busywaiting for
> >>when the GPU was not yet processing the request of interest.
> >>
> >>v3: Try another colour for the seqno names.
> >>v4: Another colour for the function names.
> 
> Adding a field in the request to track the sequence number of the
> previous request isn't ideal when considering the scheduler and
> preemption. But we've got a separate batch-in-progress sequence
> number in the hardware status page (also for use by TDR to check
> which batch is currently running), so you could use that. Then the
> check is simply

As demonstrated TDR doesn't need it, and the GPU scheduler has to fix up
the seqno anyway to maintain retirement order of the request list.
-Chris
Dave Gordon Dec. 1, 2015, 4:44 p.m. UTC | #4
On 01/12/15 15:58, Chris Wilson wrote:
> On Tue, Dec 01, 2015 at 03:47:34PM +0000, Dave Gordon wrote:
>> On 30/11/15 10:06, Tvrtko Ursulin wrote:
>>>
>>> On 29/11/15 08:48, Chris Wilson wrote:
>>>> Limit busywaiting only to the request currently being processed by the
>>>> GPU. If the request is not currently being processed by the GPU, there
>>>> is a very low likelihood of it being completed within the 2 microsecond
>>>> spin timeout and so we will just be wasting CPU cycles.
>>>>
>>>> v2: Check for logical inversion when rebasing - we were incorrectly
>>>> checking for this request being active, and instead busywaiting for
>>>> when the GPU was not yet processing the request of interest.
>>>>
>>>> v3: Try another colour for the seqno names.
>>>> v4: Another colour for the function names.
>>
>> Adding a field in the request to track the sequence number of the
>> previous request isn't ideal when considering the scheduler and
>> preemption. But we've got a separate batch-in-progress sequence
>> number in the hardware status page (also for use by TDR to check
>> which batch is currently running), so you could use that. Then the
>> check is simply
>
> As demonstrated TDR doesn't need it, and the GPU scheduler has to fix up
> the seqno anyway to maintain retirement order of the request list.
> -Chris

If the request list is in retirement order, then the previous-seqno of a 
request is simply the seqno of the previous request in the request list. 
If there isn't a previous request, then the request under consideration 
is at the head of the list and must be the currently-active request 
anyway (of course, the converse does not hold, due to lazy retirement 
and such).

The scheduler will reassign seqnos as batches are submitted to the 
backend submission mechanism (ringbuffer, execlist, or GuC), so they 
will start out in order. But that doesn't guarantee in-order completion 
once we add preemption, so we will in any case be tracking both the 
start and finish of each batch.

The (Android) TDR previously worked on by Tomas Elf did make use of the 
batch-started seqno; at least some of the enhancements in that version 
of TDR are expected to be ported into the upstream Linux soon.

BTW I would find it easier to test & review these patches if they were 
in a different order, specifically I'd prefer to see the infrastructure 
updates (patches 9-12: lazy coherency, get rid of get_seqno(), etc) 
before the features that they help simplify (spin-wait optimisation, 
thundering herd suppression, etc).

.Dave.
Daniel Vetter Dec. 3, 2015, 8:52 a.m. UTC | #5
On Tue, Dec 01, 2015 at 04:44:53PM +0000, Dave Gordon wrote:
> On 01/12/15 15:58, Chris Wilson wrote:
> >On Tue, Dec 01, 2015 at 03:47:34PM +0000, Dave Gordon wrote:
> >>On 30/11/15 10:06, Tvrtko Ursulin wrote:
> >>>
> >>>On 29/11/15 08:48, Chris Wilson wrote:
> >>>>Limit busywaiting only to the request currently being processed by the
> >>>>GPU. If the request is not currently being processed by the GPU, there
> >>>>is a very low likelihood of it being completed within the 2 microsecond
> >>>>spin timeout and so we will just be wasting CPU cycles.
> >>>>
> >>>>v2: Check for logical inversion when rebasing - we were incorrectly
> >>>>checking for this request being active, and instead busywaiting for
> >>>>when the GPU was not yet processing the request of interest.
> >>>>
> >>>>v3: Try another colour for the seqno names.
> >>>>v4: Another colour for the function names.
> >>
> >>Adding a field in the request to track the sequence number of the
> >>previous request isn't ideal when considering the scheduler and
> >>preemption. But we've got a separate batch-in-progress sequence
> >>number in the hardware status page (also for use by TDR to check
> >>which batch is currently running), so you could use that. Then the
> >>check is simply
> >
> >As demonstrated TDR doesn't need it, and the GPU scheduler has to fix up
> >the seqno anyway to maintain retirement order of the request list.
> >-Chris
> 
> If the request list is in retirement order, then the previous-seqno of a
> request is simply the seqno of the previous request in the request list. If
> there isn't a previous request, then the request under consideration is at
> the head of the list and must be the currently-active request anyway (of
> course, the converse does not hold, due to lazy retirement and such).
> 
> The scheduler will reassign seqnos as batches are submitted to the backend
> submission mechanism (ringbuffer, execlist, or GuC), so they will start out
> in order. But that doesn't guarantee in-order completion once we add
> preemption, so we will in any case be tracking both the start and finish of
> each batch.

If the scheduler stops reassigning seqnos and instead hands out seqnos
per-ctx all these problems go away. And a metric pile more problems too
...

Cheers, Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bc865e234df2..d07041c1729d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2182,8 +2182,17 @@  struct drm_i915_gem_request {
 	struct drm_i915_private *i915;
 	struct intel_engine_cs *ring;
 
-	/** GEM sequence number associated with this request. */
-	uint32_t seqno;
+	 /** GEM sequence number associated with the previous request,
+	  * when the HWS breadcrumb is equal to this the GPU is processing
+	  * this request.
+	  */
+	u32 previous_seqno;
+
+	 /** GEM sequence number associated with this request,
+	  * when the HWS breadcrumb is equal or greater than this the GPU
+	  * has finished processing this request.
+	  */
+	u32 seqno;
 
 	/** Position in the ringbuffer of the start of the request */
 	u32 head;
@@ -2945,15 +2954,17 @@  i915_seqno_passed(uint32_t seq1, uint32_t seq2)
 	return (int32_t)(seq1 - seq2) >= 0;
 }
 
+static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
+					   bool lazy_coherency)
+{
+	u32 seqno = req->ring->get_seqno(req->ring, lazy_coherency);
+	return i915_seqno_passed(seqno, req->previous_seqno);
+}
+
 static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
 					      bool lazy_coherency)
 {
-	u32 seqno;
-
-	BUG_ON(req == NULL);
-
-	seqno = req->ring->get_seqno(req->ring, lazy_coherency);
-
+	u32 seqno = req->ring->get_seqno(req->ring, lazy_coherency);
 	return i915_seqno_passed(seqno, req->seqno);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bad112abb16b..ada461e02718 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1193,9 +1193,13 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 	 * takes to sleep on a request, on the order of a microsecond.
 	 */
 
-	if (i915_gem_request_get_ring(req)->irq_refcount)
+	if (req->ring->irq_refcount)
 		return -EBUSY;
 
+	/* Only spin if we know the GPU is processing this request */
+	if (!i915_gem_request_started(req, false))
+		return -EAGAIN;
+
 	timeout = local_clock_us(&cpu) + 10;
 	while (!need_resched()) {
 		if (i915_gem_request_completed(req, true))
@@ -1209,6 +1213,7 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 
 		cpu_relax_lowlatency();
 	}
+
 	if (i915_gem_request_completed(req, false))
 		return 0;
 
@@ -2592,6 +2597,7 @@  void __i915_add_request(struct drm_i915_gem_request *request,
 	request->batch_obj = obj;
 
 	request->emitted_jiffies = jiffies;
+	request->previous_seqno = ring->last_submitted_seqno;
 	ring->last_submitted_seqno = request->seqno;
 	list_add_tail(&request->list, &ring->request_list);