Message ID | 1449833608-22125-18-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi, On 11/12/15 11:33, Chris Wilson wrote: > Now that we have split out the seqno-barrier from the > engine->get_seqno() callback itself, we can move the users of the > seqno-barrier to the required callsites simplifying the common code and > making the required workaround handling much more explicit. What bothers me about this patch, and the one preceding it, is that I don't see a tangible improvement for the programmer who still has to know when to read the seqno and when to "read it harder, read for real". Barrier in this sense has a relation to the state of things but somehow feels too low level to me when used from the code. But to be fair I am not sure how to better define it. Would ring->get_seqno paired with ring->read_seqno perhaps make sense? Implementation for ring->read_seqno would just be a flush followed by ring->get_seqno then. Or maybe keep the barrier and add ring->read_seqno which would be ring->seqno_barrier + ring_get_seqno? Regards, Tvrtko > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 4 ++-- > drivers/gpu/drm/i915/i915_drv.h | 10 ++-------- > drivers/gpu/drm/i915/i915_gem.c | 24 +++++++++++++++--------- > drivers/gpu/drm/i915/intel_display.c | 2 +- > drivers/gpu/drm/i915/intel_pm.c | 4 ++-- > 5 files changed, 22 insertions(+), 22 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 6344fe69ab82..8860dec36aae 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -601,7 +601,7 @@ static int i915_gem_pageflip_info(struct seq_file *m, void *data) > i915_gem_request_get_seqno(work->flip_queued_req), > dev_priv->next_seqno, > ring->get_seqno(ring), > - i915_gem_request_completed(work->flip_queued_req, true)); > + i915_gem_request_completed(work->flip_queued_req)); > } else > seq_printf(m, "Flip not associated with any ring\n"); > seq_printf(m, "Flip queued on frame %d, (was ready on frame 
%d), now %d\n", > @@ -1353,8 +1353,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) > intel_runtime_pm_get(dev_priv); > > for_each_ring(ring, dev_priv, i) { > - seqno[i] = ring->get_seqno(ring); > acthd[i] = intel_ring_get_active_head(ring); > + seqno[i] = ring->get_seqno(ring); > } > > intel_runtime_pm_put(dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index ff83f148658f..d099e960f9b8 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2978,20 +2978,14 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2) > return (int32_t)(seq1 - seq2) >= 0; > } > > -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req, > - bool lazy_coherency) > +static inline bool i915_gem_request_started(struct drm_i915_gem_request *req) > { > - if (!lazy_coherency && req->ring->seqno_barrier) > - req->ring->seqno_barrier(req->ring); > return i915_seqno_passed(req->ring->get_seqno(req->ring), > req->previous_seqno); > } > > -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req, > - bool lazy_coherency) > +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req) > { > - if (!lazy_coherency && req->ring->seqno_barrier) > - req->ring->seqno_barrier(req->ring); > return i915_seqno_passed(req->ring->get_seqno(req->ring), > req->seqno); > } > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index fa0cf6c9f4d0..f3c1e268f614 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -1173,12 +1173,12 @@ static bool __i915_spin_request(struct drm_i915_gem_request *req, > */ > > /* Only spin if we know the GPU is processing this request */ > - if (!i915_gem_request_started(req, true)) > + if (!i915_gem_request_started(req)) > return false; > > timeout = local_clock_us(&cpu) + 5; > do { > - if (i915_gem_request_completed(req, true)) > + if 
(i915_gem_request_completed(req)) > return true; > > if (signal_pending_state(state, wait->task)) > @@ -1230,7 +1230,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req, > if (list_empty(&req->list)) > return 0; > > - if (i915_gem_request_completed(req, true)) > + if (i915_gem_request_completed(req)) > return 0; > > timeout_remain = MAX_SCHEDULE_TIMEOUT; > @@ -1299,7 +1299,10 @@ wakeup: set_task_state(wait.task, state); > * but it is easier and safer to do it every time the waiter > * is woken. > */ > - if (i915_gem_request_completed(req, false)) > + if (req->ring->seqno_barrier) > + req->ring->seqno_barrier(req->ring); > + > + if (i915_gem_request_completed(req)) > break; > > /* We need to check whether any gpu reset happened in between > @@ -2731,8 +2734,11 @@ i915_gem_find_active_request(struct intel_engine_cs *ring) > { > struct drm_i915_gem_request *request; > > + if (ring->seqno_barrier) > + ring->seqno_barrier(ring); > + > list_for_each_entry(request, &ring->request_list, list) { > - if (i915_gem_request_completed(request, false)) > + if (i915_gem_request_completed(request)) > continue; > > return request; > @@ -2873,7 +2879,7 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring) > struct drm_i915_gem_request, > list); > > - if (!i915_gem_request_completed(request, true)) > + if (!i915_gem_request_completed(request)) > break; > > i915_gem_request_retire(request); > @@ -2897,7 +2903,7 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring) > } > > if (unlikely(ring->trace_irq_req && > - i915_gem_request_completed(ring->trace_irq_req, true))) { > + i915_gem_request_completed(ring->trace_irq_req))) { > ring->irq_put(ring); > i915_gem_request_assign(&ring->trace_irq_req, NULL); > } > @@ -3007,7 +3013,7 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj) > if (list_empty(&req->list)) > goto retire; > > - if (i915_gem_request_completed(req, true)) { > + if (i915_gem_request_completed(req)) { > 
__i915_gem_request_retire__upto(req); > retire: > i915_gem_object_retire__read(obj, i); > @@ -3116,7 +3122,7 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj, > if (to == from) > return 0; > > - if (i915_gem_request_completed(from_req, true)) > + if (i915_gem_request_completed(from_req)) > return 0; > > if (!i915_semaphore_is_enabled(obj->base.dev)) { > diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c > index 875bdf814d73..ffcdc2c631e1 100644 > --- a/drivers/gpu/drm/i915/intel_display.c > +++ b/drivers/gpu/drm/i915/intel_display.c > @@ -11459,7 +11459,7 @@ static bool __intel_pageflip_stall_check(struct drm_device *dev, > > if (work->flip_ready_vblank == 0) { > if (work->flip_queued_req && > - !i915_gem_request_completed(work->flip_queued_req, true)) > + !i915_gem_request_completed(work->flip_queued_req)) > return false; > > work->flip_ready_vblank = drm_crtc_vblank_count(crtc); > diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c > index 99f2642fd5df..570628628a90 100644 > --- a/drivers/gpu/drm/i915/intel_pm.c > +++ b/drivers/gpu/drm/i915/intel_pm.c > @@ -7188,7 +7188,7 @@ static void __intel_rps_boost_work(struct work_struct *work) > struct request_boost *boost = container_of(work, struct request_boost, work); > struct drm_i915_gem_request *req = boost->req; > > - if (!i915_gem_request_completed(req, true)) > + if (!i915_gem_request_completed(req)) > gen6_rps_boost(to_i915(req->ring->dev), NULL, > req->emitted_jiffies); > > @@ -7204,7 +7204,7 @@ void intel_queue_rps_boost_for_request(struct drm_device *dev, > if (req == NULL || INTEL_INFO(dev)->gen < 6) > return; > > - if (i915_gem_request_completed(req, true)) > + if (i915_gem_request_completed(req)) > return; > > boost = kmalloc(sizeof(*boost), GFP_ATOMIC); >
On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: > > Hi, > > On 11/12/15 11:33, Chris Wilson wrote: > >Now that we have split out the seqno-barrier from the > >engine->get_seqno() callback itself, we can move the users of the > >seqno-barrier to the required callsites simplifying the common code and > >making the required workaround handling much more explicit. > > What bothers me about this patch, and the one preceding it, is that > I don't see a tangible improvement for the programmer who still has > to know when to read the seqno and when to "read it harder, read for > real". In earlier patches, I called it irq_barrier. It's not reading it harder. It's just that there is an ordering issue with receiving an interrupt and the seqno write being visible. > Barrier in this sense has a relation to the state of things but > somehow feels too low level to me when used from the code. But to be > fair I am not sure how to better define it. > > Would ring->get_seqno paired with ring->read_seqno perhaps make > sense? Implementation for ring->read_seqno would just be a flush > followed by ring->get_seqno then. Or maybe keep the barrier and add > ring->read_seqno which would be ring->seqno_barrier + > ring_get_seqno? No. -Chris
On 14/12/15 15:11, Chris Wilson wrote: > On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: >> >> Hi, >> >> On 11/12/15 11:33, Chris Wilson wrote: >>> Now that we have split out the seqno-barrier from the >>> engine->get_seqno() callback itself, we can move the users of the >>> seqno-barrier to the required callsites simplifying the common code and >>> making the required workaround handling much more explicit. >> >> What bothers me about this patch, and the one preceding it, is that >> I don't see a tangible improvement for the programmer who still has >> to know when to read the seqno and when to "read it harder, read for >> real". > > In earlier patches, I called it irq_barrier. > > It's not reading it harder. It's just that there is a ordering issue > with receiving an interrupt and the seqno write being visible. > >> Barrier in this sense has a relation to the state of things but >> somehow feels too low level to me when used from the code. But to be >> fair I am not sure how to better define it. >> >> Would ring->get_seqno paired with ring->read_seqno perhaps make >> sense? Implementation for ring->read_seqno would just be a flush >> followed by ring->get_seqno then. Or maybe keep the barrier and add >> ring->read_seqno which would be ring->seqno_barrier + >> ring_get_seqno? > > No. > -Chris We could instead put the knowledge about whether and how to read "for real" inside the read-the-seqno function. For example: struct intel_engine_cs { ... u32 last_seqno_seen; } u32 intel_ring_read_seqno(struct intel_engine_cs *engine) { // First try simple read u32 seqno = intel_ring_get_seqno(engine); if (seqno == engine->last_seqno_seen) { // Do additional flush, then try again engine->seqno_barrier(engine); seqno = intel_ring_get_seqno(engine); } engine->last_seqno_seen = seqno; return seqno; } Then callers don't have to know anything about coherency; they can just assume that they will automatically get the latest value. 
In the presumably common case where the value *has* been updated, and the cache has noted the update and invalidated the old local value, the first read will successfully find the updated seqno and return quickly. Then we only do the extra work in the case where there would otherwise appear to be nothing to do (i.e. when we would sleep, or spin, or otherwise wait, if this function were to return the same value as last time). .Dave.
On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote: > On 14/12/15 15:11, Chris Wilson wrote: > >On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: > >> > >>Hi, > >> > >>On 11/12/15 11:33, Chris Wilson wrote: > >>>Now that we have split out the seqno-barrier from the > >>>engine->get_seqno() callback itself, we can move the users of the > >>>seqno-barrier to the required callsites simplifying the common code and > >>>making the required workaround handling much more explicit. > >> > >>What bothers me about this patch, and the one preceding it, is that > >>I don't see a tangible improvement for the programmer who still has > >>to know when to read the seqno and when to "read it harder, read for > >>real". > > > >In earlier patches, I called it irq_barrier. > > > >It's not reading it harder. It's just that there is a ordering issue > >with receiving an interrupt and the seqno write being visible. > > > >>Barrier in this sense has a relation to the state of things but > >>somehow feels too low level to me when used from the code. But to be > >>fair I am not sure how to better define it. > >> > >>Would ring->get_seqno paired with ring->read_seqno perhaps make > >>sense? Implementation for ring->read_seqno would just be a flush > >>followed by ring->get_seqno then. Or maybe keep the barrier and add > >>ring->read_seqno which would be ring->seqno_barrier + > >>ring_get_seqno? > > > >No. > >-Chris > > We could instead put the knowledge about whether and how to read > "for real" inside the read-the-seqno function. For example: You do appreciate the irony that you are on the reviewer list for patches that do that? http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34 There is just one place that we need the extra work, after the interrupt. All other places only care about the current value of the seqno in the CPU cache. -Chris
On 04/01/16 11:26, Chris Wilson wrote: > On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote: >> On 14/12/15 15:11, Chris Wilson wrote: >>> On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: >>>> >>>> Hi, >>>> >>>> On 11/12/15 11:33, Chris Wilson wrote: >>>>> Now that we have split out the seqno-barrier from the >>>>> engine->get_seqno() callback itself, we can move the users of the >>>>> seqno-barrier to the required callsites simplifying the common code and >>>>> making the required workaround handling much more explicit. >>>> >>>> What bothers me about this patch, and the one preceding it, is that >>>> I don't see a tangible improvement for the programmer who still has >>>> to know when to read the seqno and when to "read it harder, read for >>>> real". >>> >>> In earlier patches, I called it irq_barrier. >>> >>> It's not reading it harder. It's just that there is a ordering issue >>> with receiving an interrupt and the seqno write being visible. >>> >>>> Barrier in this sense has a relation to the state of things but >>>> somehow feels too low level to me when used from the code. But to be >>>> fair I am not sure how to better define it. >>>> >>>> Would ring->get_seqno paired with ring->read_seqno perhaps make >>>> sense? Implementation for ring->read_seqno would just be a flush >>>> followed by ring->get_seqno then. Or maybe keep the barrier and add >>>> ring->read_seqno which would be ring->seqno_barrier + >>>> ring_get_seqno? >>> >>> No. >>> -Chris >> >> We could instead put the knowledge about whether and how to read >> "for real" inside the read-the-seqno function. For example: > > You do appreciate the irony that you are on the reviewer list for patches > that do that? > > http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34 No, I haven't got as far as that one, since it was posted over a week after the message at the head of this thread. 
Anyway, I still can't see in that patch anything equivalent to what I described above. > There is just one place that we need the extra work, after the > interrupt. All other places only care about the current value of the > seqno in the CPU cache. > -Chris So, we could instead have a per-engine flag which says, interrupt has happened, next reader should expect an update (and try hard to see one)? Flow would be: IRQ->handler sets flag, wakes first waiter, clears IRQs waiter wakes up reads seqno (read fn sees flag => coherent, clear flag if new) checks whether (only) its request is completed not complete: enable IRQ, sleep seqno match: dequeue request, enable IRQ, process completion match+: dequeue request, wake next, process completion where 'process completion' involves updating the request state and waking all /additional/ waiters on the same request, whereas 'wake next' refers to threads waiting on the /next/ request. Is that what your patch is trying to achieve? It's a bit hard to tell with the seqno-read-optimisation and irq-dispatch changes mixed in with the r-b-tree and all the other things in this sequence. I think it would be easier to understand if some of the more obvious improvements (such as 18/32, 20/32, 24/32) were pushed in earlier, so that the code is as clear as possible before the patches that actually change the way things work are applied. And all the reset-related patches could be later, as that's an area with some subtlety to it. .Dave.
On Mon, Jan 04, 2016 at 01:02:25PM +0000, Dave Gordon wrote: > On 04/01/16 11:26, Chris Wilson wrote: > >On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote: > >>On 14/12/15 15:11, Chris Wilson wrote: > >>>On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: > >>>> > >>>>Hi, > >>>> > >>>>On 11/12/15 11:33, Chris Wilson wrote: > >>>>>Now that we have split out the seqno-barrier from the > >>>>>engine->get_seqno() callback itself, we can move the users of the > >>>>>seqno-barrier to the required callsites simplifying the common code and > >>>>>making the required workaround handling much more explicit. > >>>> > >>>>What bothers me about this patch, and the one preceding it, is that > >>>>I don't see a tangible improvement for the programmer who still has > >>>>to know when to read the seqno and when to "read it harder, read for > >>>>real". > >>> > >>>In earlier patches, I called it irq_barrier. > >>> > >>>It's not reading it harder. It's just that there is a ordering issue > >>>with receiving an interrupt and the seqno write being visible. > >>> > >>>>Barrier in this sense has a relation to the state of things but > >>>>somehow feels too low level to me when used from the code. But to be > >>>>fair I am not sure how to better define it. > >>>> > >>>>Would ring->get_seqno paired with ring->read_seqno perhaps make > >>>>sense? Implementation for ring->read_seqno would just be a flush > >>>>followed by ring->get_seqno then. Or maybe keep the barrier and add > >>>>ring->read_seqno which would be ring->seqno_barrier + > >>>>ring_get_seqno? > >>> > >>>No. > >>>-Chris > >> > >>We could instead put the knowledge about whether and how to read > >>"for real" inside the read-the-seqno function. For example: > > > >You do appreciate the irony that you are on the reviewer list for patches > >that do that? 
> > > >http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34 > > No, I haven't got as far as that one, since it was posted over a > week after the message at the head of this thread. Anyway, I still > can't see in that patch anything equivalent to what I described > above. > > >There is just one place that we need the extra work, after the > >interrupt. All other places only care about the current value of the > >seqno in the CPU cache. > >-Chris > > So, we could instead have a per-engine flag which says, interrupt > has happened, next reader should expect an update (and try hard to > see one)? Go back and read. That patch adds the commentary that should explain what needs to be done and where, continue on in the series you can see just that micro-optimisation along with the required barriers. -Chris
On 04/01/16 13:02, Dave Gordon wrote: > On 04/01/16 11:26, Chris Wilson wrote: >> On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote: >>> On 14/12/15 15:11, Chris Wilson wrote: >>>> On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: >>>>> >>>>> Hi, >>>>> >>>>> On 11/12/15 11:33, Chris Wilson wrote: >>>>>> Now that we have split out the seqno-barrier from the >>>>>> engine->get_seqno() callback itself, we can move the users of the >>>>>> seqno-barrier to the required callsites simplifying the common >>>>>> code and >>>>>> making the required workaround handling much more explicit. >>>>> >>>>> What bothers me about this patch, and the one preceding it, is that >>>>> I don't see a tangible improvement for the programmer who still has >>>>> to know when to read the seqno and when to "read it harder, read for >>>>> real". >>>> >>>> In earlier patches, I called it irq_barrier. >>>> >>>> It's not reading it harder. It's just that there is a ordering issue >>>> with receiving an interrupt and the seqno write being visible. >>>> >>>>> Barrier in this sense has a relation to the state of things but >>>>> somehow feels too low level to me when used from the code. But to be >>>>> fair I am not sure how to better define it. >>>>> >>>>> Would ring->get_seqno paired with ring->read_seqno perhaps make >>>>> sense? Implementation for ring->read_seqno would just be a flush >>>>> followed by ring->get_seqno then. Or maybe keep the barrier and add >>>>> ring->read_seqno which would be ring->seqno_barrier + >>>>> ring_get_seqno? >>>> >>>> No. >>>> -Chris >>> >>> We could instead put the knowledge about whether and how to read >>> "for real" inside the read-the-seqno function. For example: >> >> You do appreciate the irony that you are on the reviewer list for patches >> that do that? 
>> >> http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34 > > No, I haven't got as far as that one, since it was posted over a week > after the message at the head of this thread. Anyway, I still can't see > in that patch anything equivalent to what I described above. Oh, I spotted what you meant, but it's not in /that/ patch (which was a version of PATCH 15/32 (Slaughter the thundering i915_wait_request herd) it's in PATCH 19/32 (Check the CPU cached value of seqno after waking the waiter). Even so, it's not at the same level of code structure; I was suggesting pushing it all the way down, because __i915_wait_request() and/or i915_gem_request_completed() aren't the only functions that use it. .Dave.
On Mon, Jan 04, 2016 at 02:09:53PM +0000, Dave Gordon wrote: > On 04/01/16 13:02, Dave Gordon wrote: > >On 04/01/16 11:26, Chris Wilson wrote: > >>On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote: > >>>On 14/12/15 15:11, Chris Wilson wrote: > >>>>On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: > >>>>> > >>>>>Hi, > >>>>> > >>>>>On 11/12/15 11:33, Chris Wilson wrote: > >>>>>>Now that we have split out the seqno-barrier from the > >>>>>>engine->get_seqno() callback itself, we can move the users of the > >>>>>>seqno-barrier to the required callsites simplifying the common > >>>>>>code and > >>>>>>making the required workaround handling much more explicit. > >>>>> > >>>>>What bothers me about this patch, and the one preceding it, is that > >>>>>I don't see a tangible improvement for the programmer who still has > >>>>>to know when to read the seqno and when to "read it harder, read for > >>>>>real". > >>>> > >>>>In earlier patches, I called it irq_barrier. > >>>> > >>>>It's not reading it harder. It's just that there is a ordering issue > >>>>with receiving an interrupt and the seqno write being visible. > >>>> > >>>>>Barrier in this sense has a relation to the state of things but > >>>>>somehow feels too low level to me when used from the code. But to be > >>>>>fair I am not sure how to better define it. > >>>>> > >>>>>Would ring->get_seqno paired with ring->read_seqno perhaps make > >>>>>sense? Implementation for ring->read_seqno would just be a flush > >>>>>followed by ring->get_seqno then. Or maybe keep the barrier and add > >>>>>ring->read_seqno which would be ring->seqno_barrier + > >>>>>ring_get_seqno? > >>>> > >>>>No. > >>>>-Chris > >>> > >>>We could instead put the knowledge about whether and how to read > >>>"for real" inside the read-the-seqno function. For example: > >> > >>You do appreciate the irony that you are on the reviewer list for patches > >>that do that? 
> >> > >>http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34 > > > >No, I haven't got as far as that one, since it was posted over a week > >after the message at the head of this thread. Anyway, I still can't see > >in that patch anything equivalent to what I described above. > > Oh, I spotted what you meant, but it's not in /that/ patch (which > was a version of PATCH 15/32 (Slaughter the thundering > i915_wait_request herd) > it's in PATCH 19/32 (Check the CPU cached value of seqno after > waking the waiter). > > Even so, it's not at the same level of code structure; I was > suggesting pushing it all the way down, because > __i915_wait_request() and/or i915_gem_request_completed() aren't the > only functions that use it. No. I am arguing that there should be precisely one piece of code responsible for seqno-vs-interrupt ordering. Everywhere else should not have to worry about that interrupts may be asserted before the HWS write is posted. -Chris
On 04/01/16 14:20, Chris Wilson wrote: > On Mon, Jan 04, 2016 at 02:09:53PM +0000, Dave Gordon wrote: >> On 04/01/16 13:02, Dave Gordon wrote: >>> On 04/01/16 11:26, Chris Wilson wrote: >>>> On Mon, Jan 04, 2016 at 11:16:04AM +0000, Dave Gordon wrote: >>>>> On 14/12/15 15:11, Chris Wilson wrote: >>>>>> On Mon, Dec 14, 2015 at 02:59:30PM +0000, Tvrtko Ursulin wrote: >>>>>>> >>>>>>> Hi, >>>>>>> >>>>>>> On 11/12/15 11:33, Chris Wilson wrote: >>>>>>>> Now that we have split out the seqno-barrier from the >>>>>>>> engine->get_seqno() callback itself, we can move the users of the >>>>>>>> seqno-barrier to the required callsites simplifying the common >>>>>>>> code and >>>>>>>> making the required workaround handling much more explicit. >>>>>>> >>>>>>> What bothers me about this patch, and the one preceding it, is that >>>>>>> I don't see a tangible improvement for the programmer who still has >>>>>>> to know when to read the seqno and when to "read it harder, read for >>>>>>> real". >>>>>> >>>>>> In earlier patches, I called it irq_barrier. >>>>>> >>>>>> It's not reading it harder. It's just that there is a ordering issue >>>>>> with receiving an interrupt and the seqno write being visible. >>>>>> >>>>>>> Barrier in this sense has a relation to the state of things but >>>>>>> somehow feels too low level to me when used from the code. But to be >>>>>>> fair I am not sure how to better define it. >>>>>>> >>>>>>> Would ring->get_seqno paired with ring->read_seqno perhaps make >>>>>>> sense? Implementation for ring->read_seqno would just be a flush >>>>>>> followed by ring->get_seqno then. Or maybe keep the barrier and add >>>>>>> ring->read_seqno which would be ring->seqno_barrier + >>>>>>> ring_get_seqno? >>>>>> >>>>>> No. >>>>>> -Chris >>>>> >>>>> We could instead put the knowledge about whether and how to read >>>>> "for real" inside the read-the-seqno function. 
For example: >>>> >>>> You do appreciate the irony that you are on the reviewer list for patches >>>> that do that? >>>> >>>> http://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=breadcrumbs&id=34409f2d965001d7d63f21a1c5339b07eed6af34 >>> >>> No, I haven't got as far as that one, since it was posted over a week >>> after the message at the head of this thread. Anyway, I still can't see >>> in that patch anything equivalent to what I described above. >> >> Oh, I spotted what you meant, but it's not in /that/ patch >> (which was a version of PATCH 15/32 "Slaughter the thundering >> i915_wait_request herd") it's in PATCH 19/32 "Check the CPU >> cached value of seqno after waking the waiter". >> >> Even so, it's not at the same level of code structure; I was >> suggesting pushing it all the way down, because >> __i915_wait_request() and/or i915_gem_request_completed() aren't the >> only functions that use it. > > No. I am arguing that there should be precisely one piece of code > responsible for seqno-vs-interrupt ordering. Everywhere else should not > have to worry about that interrupts may be asserted before the HWS write > is posted. > -Chris That's what /I/ said. So it should be inside the /lowest-level/ function, the one that hands back a h/w sequence number to other code that doesn't care how it was obtained; in other words, either intel_ring_get_seqno() or a wrapper round it that incorporates the check-and-flush, if you still want the option of looking only at the cached copy. Whereas you put the read-barrier-read logic in __i915_wait_request() but that's too high a level, and I don't think that's the only caller where it could be advantageous to conditionally update the cached seqno from the HWSP; and the fact of reading the HWSP is hidden inside the call to i915_gem_request(), which is working at an entirely different level of abstraction. .Dave.
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 6344fe69ab82..8860dec36aae 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -601,7 +601,7 @@ static int i915_gem_pageflip_info(struct seq_file *m, void *data) i915_gem_request_get_seqno(work->flip_queued_req), dev_priv->next_seqno, ring->get_seqno(ring), - i915_gem_request_completed(work->flip_queued_req, true)); + i915_gem_request_completed(work->flip_queued_req)); } else seq_printf(m, "Flip not associated with any ring\n"); seq_printf(m, "Flip queued on frame %d, (was ready on frame %d), now %d\n", @@ -1353,8 +1353,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) intel_runtime_pm_get(dev_priv); for_each_ring(ring, dev_priv, i) { - seqno[i] = ring->get_seqno(ring); acthd[i] = intel_ring_get_active_head(ring); + seqno[i] = ring->get_seqno(ring); } intel_runtime_pm_put(dev_priv); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index ff83f148658f..d099e960f9b8 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2978,20 +2978,14 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2) return (int32_t)(seq1 - seq2) >= 0; } -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req, - bool lazy_coherency) +static inline bool i915_gem_request_started(struct drm_i915_gem_request *req) { - if (!lazy_coherency && req->ring->seqno_barrier) - req->ring->seqno_barrier(req->ring); return i915_seqno_passed(req->ring->get_seqno(req->ring), req->previous_seqno); } -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req, - bool lazy_coherency) +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req) { - if (!lazy_coherency && req->ring->seqno_barrier) - req->ring->seqno_barrier(req->ring); return i915_seqno_passed(req->ring->get_seqno(req->ring), req->seqno); } diff --git 
a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index fa0cf6c9f4d0..f3c1e268f614 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1173,12 +1173,12 @@ static bool __i915_spin_request(struct drm_i915_gem_request *req, */ /* Only spin if we know the GPU is processing this request */ - if (!i915_gem_request_started(req, true)) + if (!i915_gem_request_started(req)) return false; timeout = local_clock_us(&cpu) + 5; do { - if (i915_gem_request_completed(req, true)) + if (i915_gem_request_completed(req)) return true; if (signal_pending_state(state, wait->task)) @@ -1230,7 +1230,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req, if (list_empty(&req->list)) return 0; - if (i915_gem_request_completed(req, true)) + if (i915_gem_request_completed(req)) return 0; timeout_remain = MAX_SCHEDULE_TIMEOUT; @@ -1299,7 +1299,10 @@ wakeup: set_task_state(wait.task, state); * but it is easier and safer to do it every time the waiter * is woken. 
*/ - if (i915_gem_request_completed(req, false)) + if (req->ring->seqno_barrier) + req->ring->seqno_barrier(req->ring); + + if (i915_gem_request_completed(req)) break; /* We need to check whether any gpu reset happened in between @@ -2731,8 +2734,11 @@ i915_gem_find_active_request(struct intel_engine_cs *ring) { struct drm_i915_gem_request *request; + if (ring->seqno_barrier) + ring->seqno_barrier(ring); + list_for_each_entry(request, &ring->request_list, list) { - if (i915_gem_request_completed(request, false)) + if (i915_gem_request_completed(request)) continue; return request; @@ -2873,7 +2879,7 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring) struct drm_i915_gem_request, list); - if (!i915_gem_request_completed(request, true)) + if (!i915_gem_request_completed(request)) break; i915_gem_request_retire(request); @@ -2897,7 +2903,7 @@ i915_gem_retire_requests_ring(struct intel_engine_cs *ring) } if (unlikely(ring->trace_irq_req && - i915_gem_request_completed(ring->trace_irq_req, true))) { + i915_gem_request_completed(ring->trace_irq_req))) { ring->irq_put(ring); i915_gem_request_assign(&ring->trace_irq_req, NULL); } @@ -3007,7 +3013,7 @@ i915_gem_object_flush_active(struct drm_i915_gem_object *obj) if (list_empty(&req->list)) goto retire; - if (i915_gem_request_completed(req, true)) { + if (i915_gem_request_completed(req)) { __i915_gem_request_retire__upto(req); retire: i915_gem_object_retire__read(obj, i); @@ -3116,7 +3122,7 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj, if (to == from) return 0; - if (i915_gem_request_completed(from_req, true)) + if (i915_gem_request_completed(from_req)) return 0; if (!i915_semaphore_is_enabled(obj->base.dev)) { diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 875bdf814d73..ffcdc2c631e1 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -11459,7 +11459,7 @@ static bool __intel_pageflip_stall_check(struct 
drm_device *dev, if (work->flip_ready_vblank == 0) { if (work->flip_queued_req && - !i915_gem_request_completed(work->flip_queued_req, true)) + !i915_gem_request_completed(work->flip_queued_req)) return false; work->flip_ready_vblank = drm_crtc_vblank_count(crtc); diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 99f2642fd5df..570628628a90 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -7188,7 +7188,7 @@ static void __intel_rps_boost_work(struct work_struct *work) struct request_boost *boost = container_of(work, struct request_boost, work); struct drm_i915_gem_request *req = boost->req; - if (!i915_gem_request_completed(req, true)) + if (!i915_gem_request_completed(req)) gen6_rps_boost(to_i915(req->ring->dev), NULL, req->emitted_jiffies); @@ -7204,7 +7204,7 @@ void intel_queue_rps_boost_for_request(struct drm_device *dev, if (req == NULL || INTEL_INFO(dev)->gen < 6) return; - if (i915_gem_request_completed(req, true)) + if (i915_gem_request_completed(req)) return; boost = kmalloc(sizeof(*boost), GFP_ATOMIC);
Now that we have split out the seqno-barrier from the engine->get_seqno() callback itself, we can move the users of the seqno-barrier to the required callsites simplifying the common code and making the required workaround handling much more explicit. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_debugfs.c | 4 ++-- drivers/gpu/drm/i915/i915_drv.h | 10 ++-------- drivers/gpu/drm/i915/i915_gem.c | 24 +++++++++++++++--------- drivers/gpu/drm/i915/intel_display.c | 2 +- drivers/gpu/drm/i915/intel_pm.c | 4 ++-- 5 files changed, 22 insertions(+), 22 deletions(-)