[17/32] drm/i915: Remove the lazy_coherency parameter from request-completed?

Message ID 1449833608-22125-18-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Dec. 11, 2015, 11:33 a.m. UTC
Now that we have split out the seqno-barrier from the
engine->get_seqno() callback itself, we can move the users of the
seqno-barrier to the required callsites simplifying the common code and
making the required workaround handling much more explicit.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c  |  4 ++--
 drivers/gpu/drm/i915/i915_drv.h      | 10 ++--------
 drivers/gpu/drm/i915/i915_gem.c      | 24 +++++++++++++++---------
 drivers/gpu/drm/i915/intel_display.c |  2 +-
 drivers/gpu/drm/i915/intel_pm.c      |  4 ++--
 5 files changed, 22 insertions(+), 22 deletions(-)
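
For context: the seqno_barrier() hook split out in the preceding patch is
just a register read that forces ordering between the interrupt and the
seqno write to the Hardware Status Page. A minimal sketch of the
gen6-style hook, assuming the driver's POSTING_READ()/RING_ACTHD()
helpers (the actual per-platform workaround may differ):

	static void gen6_seqno_barrier(struct intel_engine_cs *ring)
	{
		/* The interrupt can be asserted before the seqno write to the
		 * Hardware Status Page is visible to the CPU; reading back a
		 * CS register such as ACTHD flushes that ordering.
		 */
		struct drm_i915_private *dev_priv = to_i915(ring->dev);

		POSTING_READ(RING_ACTHD(ring->mmio_base));
	}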

Comments

Tvrtko Ursulin Dec. 14, 2015, 2:59 p.m. UTC | #1
Hi,

On 11/12/15 11:33, Chris Wilson wrote:
> Now that we have split out the seqno-barrier from the
> engine->get_seqno() callback itself, we can move the users of the
> seqno-barrier to the required callsites simplifying the common code and
> making the required workaround handling much more explicit.

What bothers me about this patch, and the one preceding it, is that I 
don't see a tangible improvement for the programmer who still has to 
know when to read the seqno and when to "read it harder, read for real".

Barrier in this sense has a relation to the state of things but somehow 
feels too low level to me when used from the code. But to be fair I am 
not sure how to better define it.

Would ring->get_seqno paired with ring->read_seqno perhaps make sense? 
Implementation for ring->read_seqno would just be a flush followed by 
ring->get_seqno then. Or maybe keep the barrier and add ring->read_seqno 
which would be ring->seqno_barrier + ring->get_seqno?
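
Something like this, say (a hypothetical helper, purely to illustrate):

	static u32 intel_ring_read_seqno(struct intel_engine_cs *ring)
	{
		/* Flush any pending seqno write, then sample the value. */
		if (ring->seqno_barrier)
			ring->seqno_barrier(ring);

		return ring->get_seqno(ring);
	}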

Regards,

Tvrtko

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c  |  4 ++--
>   drivers/gpu/drm/i915/i915_drv.h      | 10 ++--------
>   drivers/gpu/drm/i915/i915_gem.c      | 24 +++++++++++++++---------
>   drivers/gpu/drm/i915/intel_display.c |  2 +-
>   drivers/gpu/drm/i915/intel_pm.c      |  4 ++--
>   5 files changed, 22 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 6344fe69ab82..8860dec36aae 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -601,7 +601,7 @@ static int i915_gem_pageflip_info(struct seq_file *m, void *data)
>   					   i915_gem_request_get_seqno(work->flip_queued_req),
>   					   dev_priv->next_seqno,
>   					   ring->get_seqno(ring),
> -					   i915_gem_request_completed(work->flip_queued_req, true));
> +					   i915_gem_request_completed(work->flip_queued_req));
>   			} else
>   				seq_printf(m, "Flip not associated with any ring\n");
>   			seq_printf(m, "Flip queued on frame %d, (was ready on frame %d), now %d\n",
> @@ -1353,8 +1353,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>   	intel_runtime_pm_get(dev_priv);
>
>   	for_each_ring(ring, dev_priv, i) {
> -		seqno[i] = ring->get_seqno(ring);
>   		acthd[i] = intel_ring_get_active_head(ring);
> +		seqno[i] = ring->get_seqno(ring);
>   	}
>
>   	intel_runtime_pm_put(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ff83f148658f..d099e960f9b8 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2978,20 +2978,14 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2)
>   	return (int32_t)(seq1 - seq2) >= 0;
>   }
>
> -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
> -					   bool lazy_coherency)
> +static inline bool i915_gem_request_started(struct drm_i915_gem_request *req)
>   {
> -	if (!lazy_coherency && req->ring->seqno_barrier)
> -		req->ring->seqno_barrier(req->ring);
>   	return i915_seqno_passed(req->ring->get_seqno(req->ring),
>   				 req->previous_seqno);
>   }
>
> -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
> -					      bool lazy_coherency)
> +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req)
>   {
> -	if (!lazy_coherency && req->ring->seqno_barrier)
> -		req->ring->seqno_barrier(req->ring);
>   	return i915_seqno_passed(req->ring->get_seqno(req->ring),
>   				 req->seqno);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index fa0cf6c9f4d0..f3c1e268f614 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1173,12 +1173,12 @@ static bool __i915_spin_request(struct drm_i915_gem_request *req,
>   	 */
>
>   	/* Only spin if we know the GPU is processing this request */
> -	if (!i915_gem_request_started(req, true))
> +	if (!i915_gem_request_started(req))
>   		return false;
>
>   	timeout = local_clock_us(&cpu) + 5;
>   	do {
> -		if (i915_gem_request_completed(req, true))
> +		if (i915_gem_request_completed(req))
>   			return true;
>
>   		if (signal_pending_state(state, wait->task))
> @@ -1230,7 +1230,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	if (list_empty(&req->list))
>   		return 0;
>
> -	if (i915_gem_request_completed(req, true))
> +	if (i915_gem_request_completed(req))
>   		return 0;
>
>   	timeout_remain = MAX_SCHEDULE_TIMEOUT;
> @@ -1299,7 +1299,10 @@ wakeup:		set_task_state(wait.task, state);
>   		 * but it is easier and safer to do it every time the waiter
>   		 * is woken.
>   		 */
> -		if (i915_gem_request_completed(req, false))
> +		if (req->ring->seqno_barrier)
> +			req->ring->seqno_barrier(req->ring);
> +
> +		if (i915_gem_request_completed(req))
>   			break;
>
>   		/* We need to check whether any gpu reset happened in between
> @@ -2731,8 +2734,11 @@ i915_gem_find_active_request(struct intel_engine_cs *ring)
>   {
>   	struct drm_i915_gem_request *request;
>
> +	if (ring->seqno_barrier)
> +		ring->seqno_barrier(ring);
> +
>   	list_for_each_entry(request, &ring->request_list, list) {
> -		if (i915_gem_request_completed(request, false))
> +		if (i915_gem_request_completed(request))
>   			continue;
>
>   		return request;
> @@ -2873,7 +2879,7 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>   					   struct drm_i915_gem_request,
>   					   list);
>
> -		if (!i915_gem_request_completed(request, true))
> +		if (!i915_gem_request_completed(request))
>   			break;
>
>   		i915_gem_request_retire(request);
> @@ -2897,7 +2903,7 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
>   	}
>
>   	if (unlikely(ring->trace_irq_req &&
> -		     i915_gem_request_completed(ring->trace_irq_req, true))) {
> +		     i915_gem_request_completed(ring->trace_irq_req))) {
>   		ring->irq_put(ring);
>   		i915_gem_request_assign(&ring->trace_irq_req, NULL);
>   	}
> @@ -3007,7 +3013,7 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
>   		if (list_empty(&req->list))
>   			goto retire;
>
> -		if (i915_gem_request_completed(req, true)) {
> +		if (i915_gem_request_completed(req)) {
>   			__i915_gem_request_retire__upto(req);
>   retire:
>   			i915_gem_object_retire__read(obj, i);
> @@ -3116,7 +3122,7 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj,
>   	if (to == from)
>   		return 0;
>
> -	if (i915_gem_request_completed(from_req, true))
> +	if (i915_gem_request_completed(from_req))
>   		return 0;
>
>   	if (!i915_semaphore_is_enabled(obj->base.dev)) {
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 875bdf814d73..ffcdc2c631e1 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -11459,7 +11459,7 @@ static bool __intel_pageflip_stall_check(struct drm_device *dev,
>
>   	if (work->flip_ready_vblank == 0) {
>   		if (work->flip_queued_req &&
> -		    !i915_gem_request_completed(work->flip_queued_req, true))
> +		    !i915_gem_request_completed(work->flip_queued_req))
>   			return false;
>
>   		work->flip_ready_vblank = drm_crtc_vblank_count(crtc);
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 99f2642fd5df..570628628a90 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -7188,7 +7188,7 @@ static void __intel_rps_boost_work(struct work_struct *work)
>   	struct request_boost *boost = container_of(work, struct request_boost, work);
>   	struct drm_i915_gem_request *req = boost->req;
>
> -	if (!i915_gem_request_completed(req, true))
> +	if (!i915_gem_request_completed(req))
>   		gen6_rps_boost(to_i915(req->ring->dev), NULL,
>   			       req->emitted_jiffies);
>
> @@ -7204,7 +7204,7 @@ void intel_queue_rps_boost_for_request(struct drm_device *dev,
>   	if (req == NULL || INTEL_INFO(dev)->gen < 6)
>   		return;
>
> -	if (i915_gem_request_completed(req, true))
> +	if (i915_gem_request_completed(req))
>   		return;
>
>   	boost = kmalloc(sizeof(*boost), GFP_ATOMIC);
>
Chris Wilson Dec. 14, 2015, 3:11 p.m. UTC | #2
On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 11/12/15 11:33, Chris Wilson wrote:
> >Now that we have split out the seqno-barrier from the
> >engine->get_seqno() callback itself, we can move the users of the
> >seqno-barrier to the required callsites simplifying the common code and
> >making the required workaround handling much more explicit.
> 
> What bothers me about this patch, and the one preceding it, is that
> I don't see a tangible improvement for the programmer who still has
> to know when to read the seqno and when to "read it harder, read for
> real".

In earlier patches, I called it irq_barrier.

It's not reading it harder. It's just that there is an ordering issue
with receiving an interrupt and the seqno write being visible.

> Barrier in this sense has a relation to the state of things but
> somehow feels too low level to me when used from the code. But to be
> fair I am not sure how to better define it.
> 
> Would ring->get_seqno paired with ring->read_seqno perhaps make
> sense? Implementation for ring->read_seqno would just be a flush
> followed by ring->get_seqno then. Or maybe keep the barrier and add
> ring->read_seqno which would be ring->seqno_barrier +
> ring->get_seqno?

No.
-Chris
Dave Gordon Jan. 4, 2016, 11:16 a.m. UTC | #3
On 14/12/15 15:11, Chris Wilson wrote:
> On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 11/12/15 11:33, Chris Wilson wrote:
>>> Now that we have split out the seqno-barrier from the
>>> engine->get_seqno() callback itself, we can move the users of the
>>> seqno-barrier to the required callsites simplifying the common code and
>>> making the required workaround handling much more explicit.
>>
>> What bothers me about this patch, and the one preceding it, is that
>> I don't see a tangible improvement for the programmer who still has
>> to know when to read the seqno and when to "read it harder, read for
>> real".
>
> In earlier patches, I called it irq_barrier.
>
> It's not reading it harder. It's just that there is an ordering issue
> with receiving an interrupt and the seqno write being visible.
>
>> Barrier in this sense has a relation to the state of things but
>> somehow feels too low level to me when used from the code. But to be
>> fair I am not sure how to better define it.
>>
>> Would ring->get_seqno paired with ring->read_seqno perhaps make
>> sense? Implementation for ring->read_seqno would just be a flush
>> followed by ring->get_seqno then. Or maybe keep the barrier and add
>> ring->read_seqno which would be ring->seqno_barrier +
> >ring->get_seqno?
>
> No.
> -Chris

We could instead put the knowledge about whether and how to read "for 
real" inside the read-the-seqno function. For example:

	struct intel_engine_cs {
		...
		u32 last_seqno_seen;
	};

	u32 intel_ring_read_seqno(struct intel_engine_cs *engine) {
		// First try simple read
		u32 seqno = intel_ring_get_seqno(engine);

		if (seqno == engine->last_seqno_seen) {
			// Do additional flush, then try again
			engine->seqno_barrier(engine);
			seqno = intel_ring_get_seqno(engine);
		}

		engine->last_seqno_seen = seqno;
		return seqno;
	}

Then callers don't have to know anything about coherency; they can just 
assume that they will automatically get the latest value.

In the presumably common case where the value *has* been updated, and 
the cache has noted the update and invalidated the old local value, the 
first read will successfully find the updated seqno and return quickly.

Then we only do the extra work in the case where there would otherwise 
appear to be nothing to do (i.e. when we would sleep, or spin, or 
otherwise wait, if this function were to return the same value as last 
time).
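
As a usage sketch (hypothetical caller, not from the series; engine and
req assumed in scope), a busy-wait would then reduce to:

	/* Coherency is handled inside intel_ring_read_seqno(); just poll. */
	while (!i915_seqno_passed(intel_ring_read_seqno(engine), req->seqno))
		cpu_relax();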

.Dave.
Chris Wilson Jan. 4, 2016, 11:26 a.m. UTC | #4
On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote:
> On 14/12/15 15:11, Chris Wilson wrote:
> >On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
> >>
> >>Hi,
> >>
> >>On 11/12/15 11:33, Chris Wilson wrote:
> >>>Now that we have split out the seqno-barrier from the
> >>>engine->get_seqno() callback itself, we can move the users of the
> >>>seqno-barrier to the required callsites simplifying the common code and
> >>>making the required workaround handling much more explicit.
> >>
> >>What bothers me about this patch, and the one preceding it, is that
> >>I don't see a tangible improvement for the programmer who still has
> >>to know when to read the seqno and when to "read it harder, read for
> >>real".
> >
> >In earlier patches, I called it irq_barrier.
> >
> >It's not reading it harder. It's just that there is an ordering issue
> >with receiving an interrupt and the seqno write being visible.
> >
> >>Barrier in this sense has a relation to the state of things but
> >>somehow feels too low level to me when used from the code. But to be
> >>fair I am not sure how to better define it.
> >>
> >>Would ring->get_seqno paired with ring->read_seqno perhaps make
> >>sense? Implementation for ring->read_seqno would just be a flush
> >>followed by ring->get_seqno then. Or maybe keep the barrier and add
> >>ring->read_seqno which would be ring->seqno_barrier +
> >>ring->get_seqno?
> >
> >No.
> >-Chris
> 
> We could instead put the knowledge about whether and how to read
> "for real" inside the read-the-seqno function. For example:

You do appreciate the irony that you are on the reviewer list for patches
that do that?

http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34

There is just one place that we need the extra work, after the
interrupt. All other places only care about the current value of the
seqno in the CPU cache.
-Chris
Dave Gordon Jan. 4, 2016, 1:02 p.m. UTC | #5
On 04/01/16 11:26, Chris Wilson wrote:
> On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote:
>> On 14/12/15 15:11, Chris Wilson wrote:
>>> On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
>>>>
>>>> Hi,
>>>>
>>>> On 11/12/15 11:33, Chris Wilson wrote:
>>>>> Now that we have split out the seqno-barrier from the
>>>>> engine->get_seqno() callback itself, we can move the users of the
>>>>> seqno-barrier to the required callsites simplifying the common code and
>>>>> making the required workaround handling much more explicit.
>>>>
>>>> What bothers me about this patch, and the one preceding it, is that
>>>> I don't see a tangible improvement for the programmer who still has
>>>> to know when to read the seqno and when to "read it harder, read for
>>>> real".
>>>
>>> In earlier patches, I called it irq_barrier.
>>>
>>> It's not reading it harder. It's just that there is an ordering issue
>>> with receiving an interrupt and the seqno write being visible.
>>>
>>>> Barrier in this sense has a relation to the state of things but
>>>> somehow feels too low level to me when used from the code. But to be
>>>> fair I am not sure how to better define it.
>>>>
>>>> Would ring->get_seqno paired with ring->read_seqno perhaps make
>>>> sense? Implementation for ring->read_seqno would just be a flush
>>>> followed by ring->get_seqno then. Or maybe keep the barrier and add
>>>> ring->read_seqno which would be ring->seqno_barrier +
>>>> ring->get_seqno?
>>>
>>> No.
>>> -Chris
>>
>> We could instead put the knowledge about whether and how to read
>> "for real" inside the read-the-seqno function. For example:
>
> You do appreciate the irony that you are on the reviewer list for patches
> that do that?
>
> http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34

No, I haven't got as far as that one, since it was posted over a week 
after the message at the head of this thread. Anyway, I still can't see 
in that patch anything equivalent to what I described above.

> There is just one place that we need the extra work, after the
> interrupt. All other places only care about the current value of the
> seqno in the CPU cache.
> -Chris

So, we could instead have a per-engine flag which says: interrupt has 
happened, next reader should expect an update (and try hard to see one)?

Flow would be:
     IRQ->handler
         sets flag, wakes first waiter, clears IRQs
     waiter wakes up
         reads seqno (read fn sees flag => coherent, clear flag if new)
         checks whether (only) its request is completed
             not complete: enable IRQ, sleep
             seqno match: dequeue request, enable IRQ, process completion
             match+: dequeue request, wake next, process completion

where 'process completion' involves updating the request state and 
waking all /additional/ waiters on the same request, whereas 'wake next' 
refers to threads waiting on the /next/ request.
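
A sketch of one way to realise that flag-based read (hypothetical atomic
irq_seen field set by the interrupt handler; this variant simply clears
the flag on first read):

	static u32 intel_ring_read_seqno(struct intel_engine_cs *engine)
	{
		/* The irq handler sets irq_seen; the first reader afterwards
		 * pays for the coherency flush, later readers stay cheap.
		 */
		if (atomic_xchg(&engine->irq_seen, 0) && engine->seqno_barrier)
			engine->seqno_barrier(engine);

		return engine->get_seqno(engine);
	}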

Is that what your patch is trying to achieve? It's a bit hard to tell 
with the seqno-read-optimisation and irq-dispatch changes mixed in with 
the r-b-tree and all the other things in this sequence.

I think it would be easier to understand if some of the more obvious 
improvements (such as 18/32, 20/32, 24/32) were pushed in earlier, so 
that the code is as clear as possible before the patches that actually 
change the way things work are applied. And all the reset-related 
patches could be later, as that's an area with some subtlety to it.

.Dave.
Chris Wilson Jan. 4, 2016, 1:11 p.m. UTC | #6
On Mon, Jan 04, 2016 at 01:02:25PM +0000, Dave Gordon wrote:
> On 04/01/16 11:26, Chris Wilson wrote:
> >On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote:
> >>On 14/12/15 15:11, Chris Wilson wrote:
> >>>On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
> >>>>
> >>>>Hi,
> >>>>
> >>>>On 11/12/15 11:33, Chris Wilson wrote:
> >>>>>Now that we have split out the seqno-barrier from the
> >>>>>engine->get_seqno() callback itself, we can move the users of the
> >>>>>seqno-barrier to the required callsites simplifying the common code and
> >>>>>making the required workaround handling much more explicit.
> >>>>
> >>>>What bothers me about this patch, and the one preceding it, is that
> >>>>I don't see a tangible improvement for the programmer who still has
> >>>>to know when to read the seqno and when to "read it harder, read for
> >>>>real".
> >>>
> >>>In earlier patches, I called it irq_barrier.
> >>>
> >>>It's not reading it harder. It's just that there is an ordering issue
> >>>with receiving an interrupt and the seqno write being visible.
> >>>
> >>>>Barrier in this sense has a relation to the state of things but
> >>>>somehow feels too low level to me when used from the code. But to be
> >>>>fair I am not sure how to better define it.
> >>>>
> >>>>Would ring->get_seqno paired with ring->read_seqno perhaps make
> >>>>sense? Implementation for ring->read_seqno would just be a flush
> >>>>followed by ring->get_seqno then. Or maybe keep the barrier and add
> >>>>ring->read_seqno which would be ring->seqno_barrier +
> >>>>ring->get_seqno?
> >>>
> >>>No.
> >>>-Chris
> >>
> >>We could instead put the knowledge about whether and how to read
> >>"for real" inside the read-the-seqno function. For example:
> >
> >You do appreciate the irony that you are on the reviewer list for patches
> >that do that?
> >
> >http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34
> 
> No, I haven't got as far as that one, since it was posted over a
> week after the message at the head of this thread. Anyway, I still
> can't see in that patch anything equivalent to what I described
> above.
> 
> >There is just one place that we need the extra work, after the
> >interrupt. All other places only care about the current value of the
> >seqno in the CPU cache.
> >-Chris
> 
> So, we could instead have a per-engine flag which says: interrupt
> has happened, next reader should expect an update (and try hard to
> see one)?

Go back and read. That patch adds the commentary that should explain
what needs to be done and where; continue on in the series and you can
see just that micro-optimisation along with the required barriers.
-Chris
Dave Gordon Jan. 4, 2016, 2:09 p.m. UTC | #7
On 04/01/16 13:02, Dave Gordon wrote:
> On 04/01/16 11:26, Chris Wilson wrote:
>> On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote:
>>> On 14/12/15 15:11, Chris Wilson wrote:
>>>> On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
>>>>>
>>>>> Hi,
>>>>>
>>>>> On 11/12/15 11:33, Chris Wilson wrote:
>>>>>> Now that we have split out the seqno-barrier from the
>>>>>> engine->get_seqno() callback itself, we can move the users of the
>>>>>> seqno-barrier to the required callsites simplifying the common
>>>>>> code and
>>>>>> making the required workaround handling much more explicit.
>>>>>
>>>>> What bothers me about this patch, and the one preceding it, is that
>>>>> I don't see a tangible improvement for the programmer who still has
>>>>> to know when to read the seqno and when to "read it harder, read for
>>>>> real".
>>>>
>>>> In earlier patches, I called it irq_barrier.
>>>>
>>>> It's not reading it harder. It's just that there is an ordering issue
>>>> with receiving an interrupt and the seqno write being visible.
>>>>
>>>>> Barrier in this sense has a relation to the state of things but
>>>>> somehow feels too low level to me when used from the code. But to be
>>>>> fair I am not sure how to better define it.
>>>>>
>>>>> Would ring->get_seqno paired with ring->read_seqno perhaps make
>>>>> sense? Implementation for ring->read_seqno would just be a flush
>>>>> followed by ring->get_seqno then. Or maybe keep the barrier and add
>>>>> ring->read_seqno which would be ring->seqno_barrier +
>>>>> ring->get_seqno?
>>>>
>>>> No.
>>>> -Chris
>>>
>>> We could instead put the knowledge about whether and how to read
>>> "for real" inside the read-the-seqno function. For example:
>>
>> You do appreciate the irony that you are on the reviewer list for patches
>> that do that?
>>
>> http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34
>
> No, I haven't got as far as that one, since it was posted over a week
> after the message at the head of this thread. Anyway, I still can't see
> in that patch anything equivalent to what I described above.

Oh, I spotted what you meant, but it's not in /that/ patch (which was a 
version of PATCH 15/32, "Slaughter the thundering i915_wait_request 
herd"); it's in PATCH 19/32, "Check the CPU cached value of seqno after 
waking the waiter".

Even so, it's not at the same level of code structure; I was suggesting 
pushing it all the way down, because __i915_wait_request() and/or 
i915_gem_request_completed() aren't the only functions that use it.

.Dave.
Chris Wilson Jan. 4, 2016, 2:20 p.m. UTC | #8
On Mon, Jan 04, 2016 at 02:09:53PM +0000, Dave Gordon wrote:
> On 04/01/16 13:02, Dave Gordon wrote:
> >On 04/01/16 11:26, Chris Wilson wrote:
> >>On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote:
> >>>On 14/12/15 15:11, Chris Wilson wrote:
> >>>>On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
> >>>>>
> >>>>>Hi,
> >>>>>
> >>>>>On 11/12/15 11:33, Chris Wilson wrote:
> >>>>>>Now that we have split out the seqno-barrier from the
> >>>>>>engine->get_seqno() callback itself, we can move the users of the
> >>>>>>seqno-barrier to the required callsites simplifying the common
> >>>>>>code and
> >>>>>>making the required workaround handling much more explicit.
> >>>>>
> >>>>>What bothers me about this patch, and the one preceding it, is that
> >>>>>I don't see a tangible improvement for the programmer who still has
> >>>>>to know when to read the seqno and when to "read it harder, read for
> >>>>>real".
> >>>>
> >>>>In earlier patches, I called it irq_barrier.
> >>>>
> >>>>It's not reading it harder. It's just that there is an ordering issue
> >>>>with receiving an interrupt and the seqno write being visible.
> >>>>
> >>>>>Barrier in this sense has a relation to the state of things but
> >>>>>somehow feels too low level to me when used from the code. But to be
> >>>>>fair I am not sure how to better define it.
> >>>>>
> >>>>>Would ring->get_seqno paired with ring->read_seqno perhaps make
> >>>>>sense? Implementation for ring->read_seqno would just be a flush
> >>>>>followed by ring->get_seqno then. Or maybe keep the barrier and add
> >>>>>ring->read_seqno which would be ring->seqno_barrier +
> >>>>>ring->get_seqno?
> >>>>
> >>>>No.
> >>>>-Chris
> >>>
> >>>We could instead put the knowledge about whether and how to read
> >>>"for real" inside the read-the-seqno function. For example:
> >>
> >>You do appreciate the irony that you are on the reviewer list for patches
> >>that do that?
> >>
> >>http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34
> >
> >No, I haven't got as far as that one, since it was posted over a week
> >after the message at the head of this thread. Anyway, I still can't see
> >in that patch anything equivalent to what I described above.
> 
> Oh, I spotted what you meant, but it's not in /that/ patch (which
> was a version of PATCH 15/32, "Slaughter the thundering
> i915_wait_request herd"); it's in PATCH 19/32, "Check the CPU
> cached value of seqno after waking the waiter".
> 
> Even so, it's not at the same level of code structure; I was
> suggesting pushing it all the way down, because
> __i915_wait_request() and/or i915_gem_request_completed() aren't the
> only functions that use it.

No. I am arguing that there should be precisely one piece of code
responsible for seqno-vs-interrupt ordering. Everywhere else should not
have to worry that interrupts may be asserted before the HWS write
is posted.
-Chris
Dave Gordon Jan. 4, 2016, 5:28 p.m. UTC | #9
On 04/01/16 14:20, Chris Wilson wrote:
> On Mon, Jan 04, 2016 at 02:09:53PM +0000, Dave Gordon wrote:
>> On 04/01/16 13:02, Dave Gordon wrote:
>>> On 04/01/16 11:26, Chris Wilson wrote:
>>>> On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote:
>>>>> On 14/12/15 15:11, Chris Wilson wrote:
>>>>>> On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote:
>>>>>>>
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 11/12/15 11:33, Chris Wilson wrote:
>>>>>>>> Now that we have split out the seqno-barrier from the
>>>>>>>> engine->get_seqno() callback itself, we can move the users of the
>>>>>>>> seqno-barrier to the required callsites simplifying the common
>>>>>>>> code and
>>>>>>>> making the required workaround handling much more explicit.
>>>>>>>
>>>>>>> What bothers me about this patch, and the one preceding it, is that
>>>>>>> I don't see a tangible improvement for the programmer who still has
>>>>>>> to know when to read the seqno and when to "read it harder, read for
>>>>>>> real".
>>>>>>
>>>>>> In earlier patches, I called it irq_barrier.
>>>>>>
>>>>>> It's not reading it harder. It's just that there is an ordering issue
>>>>>> with receiving an interrupt and the seqno write being visible.
>>>>>>
>>>>>>> Barrier in this sense has a relation to the state of things but
>>>>>>> somehow feels too low level to me when used from the code. But to be
>>>>>>> fair I am not sure how to better define it.
>>>>>>>
>>>>>>> Would ring->get_seqno paired with ring->read_seqno perhaps make
>>>>>>> sense? Implementation for ring->read_seqno would just be a flush
>>>>>>> followed by ring->get_seqno then. Or maybe keep the barrier and add
>>>>>>> ring->read_seqno which would be ring->seqno_barrier +
>>>>>>> ring->get_seqno?
>>>>>>
>>>>>> No.
>>>>>> -Chris
>>>>>
>>>>> We could instead put the knowledge about whether and how to read
>>>>> "for real" inside the read-the-seqno function. For example:
>>>>
>>>> You do appreciate the irony that you are on the reviewer list for patches
>>>> that do that?
>>>>
>>>> http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34
>>>
>>> No, I haven't got as far as that one, since it was posted over a week
>>> after the message at the head of this thread. Anyway, I still can't see
>>> in that patch anything equivalent to what I described above.
>>
>> Oh, I spotted what you meant, but it's not in /that/ patch
>> (which was a version of PATCH 15/32 "Slaughter the thundering
>> i915_wait_request herd"); it's in PATCH 19/32 "Check the CPU
>> cached value of seqno after waking the waiter".
>>
>> Even so, it's not at the same level of code structure; I was
>> suggesting pushing it all the way down, because
>> __i915_wait_request() and/or i915_gem_request_completed() aren't the
>> only functions that use it.
>
> No. I am arguing that there should be precisely one piece of code
> responsible for seqno-vs-interrupt ordering. Everywhere else should not
> have to worry that interrupts may be asserted before the HWS write
> is posted.
> -Chris

That's what /I/ said. So it should be inside the /lowest-level/ 
function, the one that hands back a h/w sequence number to other code 
that doesn't care how it was obtained; in other words, either 
intel_ring_get_seqno() or a wrapper round it that incorporates the 
check-and-flush, if you still want the option of looking only at the 
cached copy.

Whereas you put the read-barrier-read logic in __i915_wait_request(), 
which is too high a level, and I don't think that's the only caller 
where it could be advantageous to conditionally update the cached seqno 
from the HWSP; besides, the fact of reading the HWSP is hidden inside 
the call to i915_gem_request_completed(), which works at an entirely 
different level of abstraction.

.Dave.

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6344fe69ab82..8860dec36aae 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -601,7 +601,7 @@  static int i915_gem_pageflip_info(struct seq_file *m, void *data)
 					   i915_gem_request_get_seqno(work->flip_queued_req),
 					   dev_priv->next_seqno,
 					   ring->get_seqno(ring),
-					   i915_gem_request_completed(work->flip_queued_req, true));
+					   i915_gem_request_completed(work->flip_queued_req));
 			} else
 				seq_printf(m, "Flip not associated with any ring\n");
 			seq_printf(m, "Flip queued on frame %d, (was ready on frame %d), now %d\n",
@@ -1353,8 +1353,8 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	intel_runtime_pm_get(dev_priv);
 
 	for_each_ring(ring, dev_priv, i) {
-		seqno[i] = ring->get_seqno(ring);
 		acthd[i] = intel_ring_get_active_head(ring);
+		seqno[i] = ring->get_seqno(ring);
 	}
 
 	intel_runtime_pm_put(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ff83f148658f..d099e960f9b8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2978,20 +2978,14 @@  i915_seqno_passed(uint32_t seq1, uint32_t seq2)
 	return (int32_t)(seq1 - seq2) >= 0;
 }
 
-static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
-					   bool lazy_coherency)
+static inline bool i915_gem_request_started(struct drm_i915_gem_request *req)
 {
-	if (!lazy_coherency && req->ring->seqno_barrier)
-		req->ring->seqno_barrier(req->ring);
 	return i915_seqno_passed(req->ring->get_seqno(req->ring),
 				 req->previous_seqno);
 }
 
-static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
-					      bool lazy_coherency)
+static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req)
 {
-	if (!lazy_coherency && req->ring->seqno_barrier)
-		req->ring->seqno_barrier(req->ring);
 	return i915_seqno_passed(req->ring->get_seqno(req->ring),
 				 req->seqno);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fa0cf6c9f4d0..f3c1e268f614 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1173,12 +1173,12 @@  static bool __i915_spin_request(struct drm_i915_gem_request *req,
 	 */
 
 	/* Only spin if we know the GPU is processing this request */
-	if (!i915_gem_request_started(req, true))
+	if (!i915_gem_request_started(req))
 		return false;
 
 	timeout = local_clock_us(&cpu) + 5;
 	do {
-		if (i915_gem_request_completed(req, true))
+		if (i915_gem_request_completed(req))
 			return true;
 
 		if (signal_pending_state(state, wait->task))
@@ -1230,7 +1230,7 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 	if (list_empty(&req->list))
 		return 0;
 
-	if (i915_gem_request_completed(req, true))
+	if (i915_gem_request_completed(req))
 		return 0;
 
 	timeout_remain = MAX_SCHEDULE_TIMEOUT;
@@ -1299,7 +1299,10 @@  wakeup:		set_task_state(wait.task, state);
 		 * but it is easier and safer to do it every time the waiter
 		 * is woken.
 		 */
-		if (i915_gem_request_completed(req, false))
+		if (req->ring->seqno_barrier)
+			req->ring->seqno_barrier(req->ring);
+
+		if (i915_gem_request_completed(req))
 			break;
 
 		/* We need to check whether any gpu reset happened in between
@@ -2731,8 +2734,11 @@  i915_gem_find_active_request(struct intel_engine_cs *ring)
 {
 	struct drm_i915_gem_request *request;
 
+	if (ring->seqno_barrier)
+		ring->seqno_barrier(ring);
+
 	list_for_each_entry(request, &ring->request_list, list) {
-		if (i915_gem_request_completed(request, false))
+		if (i915_gem_request_completed(request))
 			continue;
 
 		return request;
@@ -2873,7 +2879,7 @@  i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
 					   struct drm_i915_gem_request,
 					   list);
 
-		if (!i915_gem_request_completed(request, true))
+		if (!i915_gem_request_completed(request))
 			break;
 
 		i915_gem_request_retire(request);
@@ -2897,7 +2903,7 @@  i915_gem_retire_requests_ring(struct intel_engine_cs *ring)
 	}
 
 	if (unlikely(ring->trace_irq_req &&
-		     i915_gem_request_completed(ring->trace_irq_req, true))) {
+		     i915_gem_request_completed(ring->trace_irq_req))) {
 		ring->irq_put(ring);
 		i915_gem_request_assign(&ring->trace_irq_req, NULL);
 	}
@@ -3007,7 +3013,7 @@  i915_gem_object_flush_active(struct drm_i915_gem_object *obj)
 		if (list_empty(&req->list))
 			goto retire;
 
-		if (i915_gem_request_completed(req, true)) {
+		if (i915_gem_request_completed(req)) {
 			__i915_gem_request_retire__upto(req);
 retire:
 			i915_gem_object_retire__read(obj, i);
@@ -3116,7 +3122,7 @@  __i915_gem_object_sync(struct drm_i915_gem_object *obj,
 	if (to == from)
 		return 0;
 
-	if (i915_gem_request_completed(from_req, true))
+	if (i915_gem_request_completed(from_req))
 		return 0;
 
 	if (!i915_semaphore_is_enabled(obj->base.dev)) {
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 875bdf814d73..ffcdc2c631e1 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11459,7 +11459,7 @@  static bool __intel_pageflip_stall_check(struct drm_device *dev,
 
 	if (work->flip_ready_vblank == 0) {
 		if (work->flip_queued_req &&
-		    !i915_gem_request_completed(work->flip_queued_req, true))
+		    !i915_gem_request_completed(work->flip_queued_req))
 			return false;
 
 		work->flip_ready_vblank = drm_crtc_vblank_count(crtc);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 99f2642fd5df..570628628a90 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -7188,7 +7188,7 @@  static void __intel_rps_boost_work(struct work_struct *work)
 	struct request_boost *boost = container_of(work, struct request_boost, work);
 	struct drm_i915_gem_request *req = boost->req;
 
-	if (!i915_gem_request_completed(req, true))
+	if (!i915_gem_request_completed(req))
 		gen6_rps_boost(to_i915(req->ring->dev), NULL,
 			       req->emitted_jiffies);
 
@@ -7204,7 +7204,7 @@  void intel_queue_rps_boost_for_request(struct drm_device *dev,
 	if (req == NULL || INTEL_INFO(dev)->gen < 6)
 		return;
 
-	if (i915_gem_request_completed(req, true))
+	if (i915_gem_request_completed(req))
 		return;
 
 	boost = kmalloc(sizeof(*boost), GFP_ATOMIC);