Message ID | 20190117143519.16086-20-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/23] drm/i915: Make all GPU resets atomic | expand |
On 17/01/2019 14:35, Chris Wilson wrote: > Now that we have allocated ourselves a cacheline to store a breadcrumb, > we can emit a write from the GPU into the timeline's HWSP of the > per-context seqno as we complete each request. This drops the mirroring > of the per-engine HWSP and allows each context to operate independently. > We do not need to unwind the per-context timeline, and so requests are > always consistent with the timeline breadcrumb, greatly simplifying the > completion checks as we no longer need to be concerned about the > global_seqno changing mid check. > > At this point, we are emitting both per-context and global seqno and > still using the single per-engine execution timeline for resolving > interrupts. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_gem.c | 2 +- > drivers/gpu/drm/i915/i915_request.c | 2 +- > drivers/gpu/drm/i915/i915_request.h | 27 ++---- > drivers/gpu/drm/i915/i915_reset.c | 1 + > drivers/gpu/drm/i915/i915_vma.h | 7 ++ > drivers/gpu/drm/i915/intel_lrc.c | 32 ++++--- > drivers/gpu/drm/i915/intel_ringbuffer.c | 91 ++++++++++++++------ > drivers/gpu/drm/i915/selftests/mock_engine.c | 8 +- > 8 files changed, 109 insertions(+), 61 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index 3c6091021290..a5bd51987c0d 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -2890,7 +2890,7 @@ i915_gem_find_active_request(struct intel_engine_cs *engine) > */ > spin_lock_irqsave(&engine->timeline.lock, flags); > list_for_each_entry(request, &engine->timeline.requests, link) { > - if (__i915_request_completed(request, request->global_seqno)) > + if (i915_request_completed(request)) > continue; > > active = request; > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > index 7d068c406a49..0d7b71aff28f 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -614,7 +614,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) > rq->ring = ce->ring; > rq->timeline = ce->ring->timeline; > GEM_BUG_ON(rq->timeline == &engine->timeline); > - rq->hwsp_seqno = &engine->status_page.addr[I915_GEM_HWS_INDEX]; > + rq->hwsp_seqno = rq->timeline->hwsp_seqno; > > spin_lock_init(&rq->lock); > dma_fence_init(&rq->fence, > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h > index 4dd22dadf5ce..a16a3b7f7d92 100644 > --- a/drivers/gpu/drm/i915/i915_request.h > +++ b/drivers/gpu/drm/i915/i915_request.h > @@ -324,32 +324,21 @@ static inline u32 hwsp_seqno(const struct i915_request *rq) > */ > static inline bool i915_request_started(const struct i915_request *rq) > { > - u32 seqno; > - > - seqno = i915_request_global_seqno(rq); > - if (!seqno) /* not yet submitted to HW */ > - return false; > - > - return i915_seqno_passed(hwsp_seqno(rq), seqno - 1); > + return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1); > } > > -static inline bool > -__i915_request_completed(const struct i915_request *rq, u32 seqno) > +static inline bool i915_request_completed(const struct i915_request *rq) > { > - GEM_BUG_ON(!seqno); > - return i915_seqno_passed(hwsp_seqno(rq), seqno) && > - seqno == i915_request_global_seqno(rq); > + return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno); > } > > -static inline bool i915_request_completed(const struct i915_request *rq) > +static inline void i915_request_fake_complete(const struct i915_request *rq) I don't like this name. force_complete? force_hwsp_complete? Or turn it around, i915_hwsp_write(rq->hwsp_seqno, rq->fence.seqno)? Now it is beginning to remind of intel_write_status_page. :) > { > - u32 seqno; > - > - seqno = i915_request_global_seqno(rq); > - if (!seqno) > - return false; > + /* Don't allow ourselves to accidentally go backwards. */ > + if (i915_request_completed(rq)) > + return; > > - return __i915_request_completed(rq, seqno); > + WRITE_ONCE(*(u32 *)rq->hwsp_seqno, rq->fence.seqno); > } > > void i915_retire_requests(struct drm_i915_private *i915); > diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c > index 12e5a2bc825c..eff76558b958 100644 > --- a/drivers/gpu/drm/i915/i915_reset.c > +++ b/drivers/gpu/drm/i915/i915_reset.c > @@ -756,6 +756,7 @@ static void nop_submit_request(struct i915_request *request) > > spin_lock_irqsave(&request->engine->timeline.lock, flags); > __i915_request_submit(request); > + i915_request_fake_complete(request); > intel_engine_write_global_seqno(request->engine, request->global_seqno); > spin_unlock_irqrestore(&request->engine->timeline.lock, flags); > } > diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h > index 5793abe509a2..18be786a970d 100644 > --- a/drivers/gpu/drm/i915/i915_vma.h > +++ b/drivers/gpu/drm/i915/i915_vma.h > @@ -221,6 +221,13 @@ static inline u32 i915_ggtt_offset(const struct i915_vma *vma) > return lower_32_bits(vma->node.start); > } > > +/* XXX inline spaghetti */ > +static inline u32 i915_timeline_seqno_address(const struct i915_timeline *tl) > +{ > + GEM_BUG_ON(!tl->pin_count); > + return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset; > +} > + > static inline u32 i915_ggtt_pin_bias(struct i915_vma *vma) > { > return i915_vm_to_ggtt(vma->vm)->pin_bias; > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index a624e644fbd7..593928dd6bbe 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -827,10 +827,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) > list_for_each_entry(rq, &engine->timeline.requests, link) { > GEM_BUG_ON(!rq->global_seqno); > > - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) > - continue; > + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) > + dma_fence_set_error(&rq->fence, -EIO); > > - dma_fence_set_error(&rq->fence, -EIO); > + i915_request_fake_complete(rq); > } > > /* Flush the queued requests to the timeline list (for retiring). */ > @@ -843,6 +843,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) > > dma_fence_set_error(&rq->fence, -EIO); > __i915_request_submit(rq); > + i915_request_fake_complete(rq); > } > > rb_erase_cached(&p->node, &execlists->queue); > @@ -2022,31 +2023,40 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs) > /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */ > BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5)); > > - cs = gen8_emit_ggtt_write(cs, request->global_seqno, > + cs = gen8_emit_ggtt_write(cs, > + request->fence.seqno, > + i915_timeline_seqno_address(request->timeline)); > + > + cs = gen8_emit_ggtt_write(cs, > + request->global_seqno, > intel_hws_seqno_address(request->engine)); > + > *cs++ = MI_USER_INTERRUPT; > *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; > + > request->tail = intel_ring_offset(request, cs); > assert_ring_tail_valid(request->ring, request->tail); > > gen8_emit_wa_tail(request, cs); > } > -static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS; > +static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS; > > static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs) > { > - /* We're using qword write, seqno should be aligned to 8 bytes. */ > - BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1); > - > cs = gen8_emit_ggtt_write_rcs(cs, > - request->global_seqno, > - intel_hws_seqno_address(request->engine), > + request->fence.seqno, > + i915_timeline_seqno_address(request->timeline), > PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | > PIPE_CONTROL_DEPTH_CACHE_FLUSH | > PIPE_CONTROL_DC_FLUSH_ENABLE | > PIPE_CONTROL_FLUSH_ENABLE | > PIPE_CONTROL_CS_STALL); > > + cs = gen8_emit_ggtt_write_rcs(cs, > + request->global_seqno, > + intel_hws_seqno_address(request->engine), > + PIPE_CONTROL_CS_STALL); > + > *cs++ = MI_USER_INTERRUPT; > *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; > > @@ -2055,7 +2065,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs) > > gen8_emit_wa_tail(request, cs); > } > -static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS; > +static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS; > > static int gen8_init_rcs_context(struct i915_request *rq) > { > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index 5887304bc3ae..bcc700e7037b 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -325,6 +325,12 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) > PIPE_CONTROL_DC_FLUSH_ENABLE | > PIPE_CONTROL_QW_WRITE | > PIPE_CONTROL_CS_STALL); > + *cs++ = i915_timeline_seqno_address(rq->timeline) | > + PIPE_CONTROL_GLOBAL_GTT; > + *cs++ = rq->fence.seqno; > + > + *cs++ = GFX_OP_PIPE_CONTROL(4); > + *cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; > *cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT; > *cs++ = rq->global_seqno; > > @@ -334,7 +340,7 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) > rq->tail = intel_ring_offset(rq, cs); > assert_ring_tail_valid(rq->ring, rq->tail); > } > -static const int gen6_rcs_emit_breadcrumb_sz = 14; > +static const int gen6_rcs_emit_breadcrumb_sz = 18; > > static int > gen7_render_ring_cs_stall_wa(struct i915_request *rq) > @@ -425,6 +431,13 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) > PIPE_CONTROL_QW_WRITE | > PIPE_CONTROL_GLOBAL_GTT_IVB | > PIPE_CONTROL_CS_STALL); > + *cs++ = i915_timeline_seqno_address(rq->timeline); > + *cs++ = rq->fence.seqno; > + > + *cs++ = GFX_OP_PIPE_CONTROL(4); > + *cs++ = (PIPE_CONTROL_QW_WRITE | > + PIPE_CONTROL_GLOBAL_GTT_IVB | > + PIPE_CONTROL_CS_STALL); > *cs++ = intel_hws_seqno_address(rq->engine); > *cs++ = rq->global_seqno; > > @@ -434,27 +447,37 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) > rq->tail = intel_ring_offset(rq, cs); > assert_ring_tail_valid(rq->ring, rq->tail); > } > -static const int gen7_rcs_emit_breadcrumb_sz = 6; > +static const int gen7_rcs_emit_breadcrumb_sz = 10; > > static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) > { > - *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW; > - *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT; > + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; > + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; > + *cs++ = rq->fence.seqno; > + > + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; > + *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT; > *cs++ = rq->global_seqno; > + > *cs++ = MI_USER_INTERRUPT; > + *cs++ = MI_NOOP; > > rq->tail = intel_ring_offset(rq, cs); > assert_ring_tail_valid(rq->ring, rq->tail); > } > -static const int gen6_xcs_emit_breadcrumb_sz = 4; > +static const int gen6_xcs_emit_breadcrumb_sz = 8; > > #define GEN7_XCS_WA 32 > static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) > { > int i; > > - *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW; > - *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT; > + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; > + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; > + *cs++ = rq->fence.seqno; > + > + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; > + *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT; > *cs++ = rq->global_seqno; > > for (i = 0; i < GEN7_XCS_WA; i++) { > @@ -468,12 +491,11 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) > *cs++ = 0; > > *cs++ = MI_USER_INTERRUPT; > - *cs++ = MI_NOOP; > > rq->tail = intel_ring_offset(rq, cs); > assert_ring_tail_valid(rq->ring, rq->tail); > } > -static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3; > +static const int gen7_xcs_emit_breadcrumb_sz = 10 + GEN7_XCS_WA * 3; > #undef GEN7_XCS_WA > > static void set_hwstam(struct intel_engine_cs *engine, u32 mask) > @@ -733,7 +755,7 @@ static void reset_ring(struct intel_engine_cs *engine, bool stalled) > rq = NULL; > spin_lock_irqsave(&tl->lock, flags); > list_for_each_entry(pos, &tl->requests, link) { > - if (!__i915_request_completed(pos, pos->global_seqno)) { > + if (!i915_request_completed(pos)) { > rq = pos; > break; > } > @@ -875,11 +897,10 @@ static void cancel_requests(struct intel_engine_cs *engine) > list_for_each_entry(request, &engine->timeline.requests, link) { > GEM_BUG_ON(!request->global_seqno); > > - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, > - &request->fence.flags)) > - continue; > - > - dma_fence_set_error(&request->fence, -EIO); > + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, > + &request->fence.flags)) > + dma_fence_set_error(&request->fence, -EIO); > + i915_request_fake_complete(request); > } > > intel_write_status_page(engine, > @@ -903,27 +924,38 @@ static void i9xx_submit_request(struct i915_request *request) > > static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs) > { > + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); > + > *cs++ = MI_FLUSH; > > + *cs++ = MI_STORE_DWORD_INDEX; > + *cs++ = I915_GEM_HWS_SEQNO_ADDR; > + *cs++ = rq->fence.seqno; > + > *cs++ = MI_STORE_DWORD_INDEX; > *cs++ = I915_GEM_HWS_INDEX_ADDR; > *cs++ = rq->global_seqno; > > *cs++ = MI_USER_INTERRUPT; > - *cs++ = MI_NOOP; > > rq->tail = intel_ring_offset(rq, cs); > assert_ring_tail_valid(rq->ring, rq->tail); > } > -static const int i9xx_emit_breadcrumb_sz = 6; > +static const int i9xx_emit_breadcrumb_sz = 8; > > #define GEN5_WA_STORES 8 /* must be at least 1! */ > static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs) > { > int i; > > + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); > + > *cs++ = MI_FLUSH; > > + *cs++ = MI_STORE_DWORD_INDEX; > + *cs++ = I915_GEM_HWS_SEQNO_ADDR; > + *cs++ = rq->fence.seqno; > + > BUILD_BUG_ON(GEN5_WA_STORES < 1); > for (i = 0; i < GEN5_WA_STORES; i++) { > *cs++ = MI_STORE_DWORD_INDEX; > @@ -932,11 +964,12 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs) > } > > *cs++ = MI_USER_INTERRUPT; > + *cs++ = MI_NOOP; > > rq->tail = intel_ring_offset(rq, cs); > assert_ring_tail_valid(rq->ring, rq->tail); > } > -static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 2; > +static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 6; > #undef GEN5_WA_STORES > > static void > @@ -1163,6 +1196,10 @@ int intel_ring_pin(struct intel_ring *ring) > > GEM_BUG_ON(ring->vaddr); > > + ret = i915_timeline_pin(ring->timeline); > + if (ret) > + return ret; > + > flags = PIN_GLOBAL; > > /* Ring wraparound at offset 0 sometimes hangs. No idea why. */ > @@ -1179,28 +1216,32 @@ int intel_ring_pin(struct intel_ring *ring) > else > ret = i915_gem_object_set_to_cpu_domain(vma->obj, true); > if (unlikely(ret)) > - return ret; > + goto unpin_timeline; > } > > ret = i915_vma_pin(vma, 0, 0, flags); > if (unlikely(ret)) > - return ret; > + goto unpin_timeline; > > if (i915_vma_is_map_and_fenceable(vma)) > addr = (void __force *)i915_vma_pin_iomap(vma); > else > addr = i915_gem_object_pin_map(vma->obj, map); > - if (IS_ERR(addr)) > - goto err; > + if (IS_ERR(addr)) { > + ret = PTR_ERR(addr); > + goto unpin_ring; > + } > > vma->obj->pin_global++; > > ring->vaddr = addr; > return 0; > > -err: > +unpin_ring: > i915_vma_unpin(vma); > - return PTR_ERR(addr); > +unpin_timeline: > + i915_timeline_unpin(ring->timeline); > + return ret; > } > > void intel_ring_reset(struct intel_ring *ring, u32 tail) > @@ -1229,6 +1270,8 @@ void intel_ring_unpin(struct intel_ring *ring) > > ring->vma->obj->pin_global--; > i915_vma_unpin(ring->vma); > + > + i915_timeline_unpin(ring->timeline); > } > > static struct i915_vma * > diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c > index acd27c7e807b..b4b61056b227 100644 > --- a/drivers/gpu/drm/i915/selftests/mock_engine.c > +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c > @@ -79,6 +79,7 @@ static void advance(struct mock_engine *engine, > struct mock_request *request) > { > list_del_init(&request->link); > + i915_request_fake_complete(&request->base); > mock_seqno_advance(&engine->base, request->base.global_seqno); > } > > @@ -253,16 +254,13 @@ void mock_engine_flush(struct intel_engine_cs *engine) > del_timer_sync(&mock->hw_delay); > > spin_lock_irq(&mock->hw_lock); > - list_for_each_entry_safe(request, rn, &mock->hw_queue, link) { > - list_del_init(&request->link); > - mock_seqno_advance(&mock->base, request->base.global_seqno); > - } > + list_for_each_entry_safe(request, rn, &mock->hw_queue, link) > + advance(mock, request); > spin_unlock_irq(&mock->hw_lock); > } > > void mock_engine_reset(struct intel_engine_cs *engine) > { > - intel_write_status_page(engine, I915_GEM_HWS_INDEX, 0); > } > > void mock_engine_free(struct intel_engine_cs *engine) > Looks good. Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 3c6091021290..a5bd51987c0d 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2890,7 +2890,7 @@ i915_gem_find_active_request(struct intel_engine_cs *engine) */ spin_lock_irqsave(&engine->timeline.lock, flags); list_for_each_entry(request, &engine->timeline.requests, link) { - if (__i915_request_completed(request, request->global_seqno)) + if (i915_request_completed(request)) continue; active = request; diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 7d068c406a49..0d7b71aff28f 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -614,7 +614,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) rq->ring = ce->ring; rq->timeline = ce->ring->timeline; GEM_BUG_ON(rq->timeline == &engine->timeline); - rq->hwsp_seqno = &engine->status_page.addr[I915_GEM_HWS_INDEX]; + rq->hwsp_seqno = rq->timeline->hwsp_seqno; spin_lock_init(&rq->lock); dma_fence_init(&rq->fence, diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 4dd22dadf5ce..a16a3b7f7d92 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -324,32 +324,21 @@ static inline u32 hwsp_seqno(const struct i915_request *rq) */ static inline bool i915_request_started(const struct i915_request *rq) { - u32 seqno; - - seqno = i915_request_global_seqno(rq); - if (!seqno) /* not yet submitted to HW */ - return false; - - return i915_seqno_passed(hwsp_seqno(rq), seqno - 1); + return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1); } -static inline bool -__i915_request_completed(const struct i915_request *rq, u32 seqno) +static inline bool i915_request_completed(const struct i915_request *rq) { - GEM_BUG_ON(!seqno); - return i915_seqno_passed(hwsp_seqno(rq), seqno) && - seqno == i915_request_global_seqno(rq); + return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno); } -static inline bool i915_request_completed(const struct i915_request *rq) +static inline void i915_request_fake_complete(const struct i915_request *rq) { - u32 seqno; - - seqno = i915_request_global_seqno(rq); - if (!seqno) - return false; + /* Don't allow ourselves to accidentally go backwards. */ + if (i915_request_completed(rq)) + return; - return __i915_request_completed(rq, seqno); + WRITE_ONCE(*(u32 *)rq->hwsp_seqno, rq->fence.seqno); } void i915_retire_requests(struct drm_i915_private *i915); diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c index 12e5a2bc825c..eff76558b958 100644 --- a/drivers/gpu/drm/i915/i915_reset.c +++ b/drivers/gpu/drm/i915/i915_reset.c @@ -756,6 +756,7 @@ static void nop_submit_request(struct i915_request *request) spin_lock_irqsave(&request->engine->timeline.lock, flags); __i915_request_submit(request); + i915_request_fake_complete(request); intel_engine_write_global_seqno(request->engine, request->global_seqno); spin_unlock_irqrestore(&request->engine->timeline.lock, flags); } diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index 5793abe509a2..18be786a970d 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -221,6 +221,13 @@ static inline u32 i915_ggtt_offset(const struct i915_vma *vma) return lower_32_bits(vma->node.start); } +/* XXX inline spaghetti */ +static inline u32 i915_timeline_seqno_address(const struct i915_timeline *tl) +{ + GEM_BUG_ON(!tl->pin_count); + return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset; +} + static inline u32 i915_ggtt_pin_bias(struct i915_vma *vma) { return i915_vm_to_ggtt(vma->vm)->pin_bias; diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index a624e644fbd7..593928dd6bbe 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -827,10 +827,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) list_for_each_entry(rq, &engine->timeline.requests, link) { GEM_BUG_ON(!rq->global_seqno); - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) - continue; + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags)) + dma_fence_set_error(&rq->fence, -EIO); - dma_fence_set_error(&rq->fence, -EIO); + i915_request_fake_complete(rq); } /* Flush the queued requests to the timeline list (for retiring). */ @@ -843,6 +843,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) dma_fence_set_error(&rq->fence, -EIO); __i915_request_submit(rq); + i915_request_fake_complete(rq); } rb_erase_cached(&p->node, &execlists->queue); @@ -2022,31 +2023,40 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs) /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */ BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5)); - cs = gen8_emit_ggtt_write(cs, request->global_seqno, + cs = gen8_emit_ggtt_write(cs, + request->fence.seqno, + i915_timeline_seqno_address(request->timeline)); + + cs = gen8_emit_ggtt_write(cs, + request->global_seqno, intel_hws_seqno_address(request->engine)); + *cs++ = MI_USER_INTERRUPT; *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + request->tail = intel_ring_offset(request, cs); assert_ring_tail_valid(request->ring, request->tail); gen8_emit_wa_tail(request, cs); } -static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS; +static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS; static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs) { - /* We're using qword write, seqno should be aligned to 8 bytes. */ - BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1); - cs = gen8_emit_ggtt_write_rcs(cs, - request->global_seqno, - intel_hws_seqno_address(request->engine), + request->fence.seqno, + i915_timeline_seqno_address(request->timeline), PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | PIPE_CONTROL_DEPTH_CACHE_FLUSH | PIPE_CONTROL_DC_FLUSH_ENABLE | PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL); + cs = gen8_emit_ggtt_write_rcs(cs, + request->global_seqno, + intel_hws_seqno_address(request->engine), + PIPE_CONTROL_CS_STALL); + *cs++ = MI_USER_INTERRUPT; *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; @@ -2055,7 +2065,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs) gen8_emit_wa_tail(request, cs); } -static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS; +static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS; static int gen8_init_rcs_context(struct i915_request *rq) { diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 5887304bc3ae..bcc700e7037b 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -325,6 +325,12 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) PIPE_CONTROL_DC_FLUSH_ENABLE | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL); + *cs++ = i915_timeline_seqno_address(rq->timeline) | + PIPE_CONTROL_GLOBAL_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; *cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT; *cs++ = rq->global_seqno; @@ -334,7 +340,7 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); } -static const int gen6_rcs_emit_breadcrumb_sz = 14; +static const int gen6_rcs_emit_breadcrumb_sz = 18; static int gen7_render_ring_cs_stall_wa(struct i915_request *rq) @@ -425,6 +431,13 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL); + *cs++ = i915_timeline_seqno_address(rq->timeline); + *cs++ = rq->fence.seqno; + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = (PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL); *cs++ = intel_hws_seqno_address(rq->engine); *cs++ = rq->global_seqno; @@ -434,27 +447,37 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); } -static const int gen7_rcs_emit_breadcrumb_sz = 6; +static const int gen7_rcs_emit_breadcrumb_sz = 10; static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) { - *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW; - *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT; + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT; *cs++ = rq->global_seqno; + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); } -static const int gen6_xcs_emit_breadcrumb_sz = 4; +static const int gen6_xcs_emit_breadcrumb_sz = 8; #define GEN7_XCS_WA 32 static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) { int i; - *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW; - *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT; + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT; *cs++ = rq->global_seqno; for (i = 0; i < GEN7_XCS_WA; i++) { @@ -468,12 +491,11 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) *cs++ = 0; *cs++ = MI_USER_INTERRUPT; - *cs++ = MI_NOOP; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); } -static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3; +static const int gen7_xcs_emit_breadcrumb_sz = 10 + GEN7_XCS_WA * 3; #undef GEN7_XCS_WA static void set_hwstam(struct intel_engine_cs *engine, u32 mask) @@ -733,7 +755,7 @@ static void reset_ring(struct intel_engine_cs *engine, bool stalled) rq = NULL; spin_lock_irqsave(&tl->lock, flags); list_for_each_entry(pos, &tl->requests, link) { - if (!__i915_request_completed(pos, pos->global_seqno)) { + if (!i915_request_completed(pos)) { rq = pos; break; } @@ -875,11 +897,10 @@ static void cancel_requests(struct intel_engine_cs *engine) list_for_each_entry(request, &engine->timeline.requests, link) { GEM_BUG_ON(!request->global_seqno); - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, - &request->fence.flags)) - continue; - - dma_fence_set_error(&request->fence, -EIO); + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + &request->fence.flags)) + dma_fence_set_error(&request->fence, -EIO); + i915_request_fake_complete(request); } intel_write_status_page(engine, @@ -903,27 +924,38 @@ static void i9xx_submit_request(struct i915_request *request) static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs) { + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); + *cs++ = MI_FLUSH; + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + *cs++ = MI_STORE_DWORD_INDEX; *cs++ = I915_GEM_HWS_INDEX_ADDR; *cs++ = rq->global_seqno; *cs++ = MI_USER_INTERRUPT; - *cs++ = MI_NOOP; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); } -static const int i9xx_emit_breadcrumb_sz = 6; +static const int i9xx_emit_breadcrumb_sz = 8; #define GEN5_WA_STORES 8 /* must be at least 1! */ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs) { int i; + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); + *cs++ = MI_FLUSH; + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + BUILD_BUG_ON(GEN5_WA_STORES < 1); for (i = 0; i < GEN5_WA_STORES; i++) { *cs++ = MI_STORE_DWORD_INDEX; @@ -932,11 +964,12 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs) } *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); } -static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 2; +static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 6; #undef GEN5_WA_STORES static void @@ -1163,6 +1196,10 @@ int intel_ring_pin(struct intel_ring *ring) GEM_BUG_ON(ring->vaddr); + ret = i915_timeline_pin(ring->timeline); + if (ret) + return ret; + flags = PIN_GLOBAL; /* Ring wraparound at offset 0 sometimes hangs. No idea why. */ @@ -1179,28 +1216,32 @@ int intel_ring_pin(struct intel_ring *ring) else ret = i915_gem_object_set_to_cpu_domain(vma->obj, true); if (unlikely(ret)) - return ret; + goto unpin_timeline; } ret = i915_vma_pin(vma, 0, 0, flags); if (unlikely(ret)) - return ret; + goto unpin_timeline; if (i915_vma_is_map_and_fenceable(vma)) addr = (void __force *)i915_vma_pin_iomap(vma); else addr = i915_gem_object_pin_map(vma->obj, map); - if (IS_ERR(addr)) - goto err; + if (IS_ERR(addr)) { + ret = PTR_ERR(addr); + goto unpin_ring; + } vma->obj->pin_global++; ring->vaddr = addr; return 0; -err: +unpin_ring: i915_vma_unpin(vma); - return PTR_ERR(addr); +unpin_timeline: + i915_timeline_unpin(ring->timeline); + return ret; } void intel_ring_reset(struct intel_ring *ring, u32 tail) @@ -1229,6 +1270,8 @@ void intel_ring_unpin(struct intel_ring *ring) ring->vma->obj->pin_global--; i915_vma_unpin(ring->vma); + + i915_timeline_unpin(ring->timeline); } static struct i915_vma * diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c index acd27c7e807b..b4b61056b227 100644 --- a/drivers/gpu/drm/i915/selftests/mock_engine.c +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c @@ -79,6 +79,7 @@ static void advance(struct mock_engine *engine, struct mock_request *request) { list_del_init(&request->link); + i915_request_fake_complete(&request->base); mock_seqno_advance(&engine->base, request->base.global_seqno); } @@ -253,16 +254,13 @@ void mock_engine_flush(struct intel_engine_cs *engine) del_timer_sync(&mock->hw_delay); spin_lock_irq(&mock->hw_lock); - list_for_each_entry_safe(request, rn, &mock->hw_queue, link) { - list_del_init(&request->link); - mock_seqno_advance(&mock->base, request->base.global_seqno); - } + list_for_each_entry_safe(request, rn, &mock->hw_queue, link) + advance(mock, request); spin_unlock_irq(&mock->hw_lock); } void mock_engine_reset(struct intel_engine_cs *engine) { - intel_write_status_page(engine, I915_GEM_HWS_INDEX, 0); } void mock_engine_free(struct intel_engine_cs *engine)
Now that we have allocated ourselves a cacheline to store a breadcrumb, we can emit a write from the GPU into the timeline's HWSP of the per-context seqno as we complete each request. This drops the mirroring of the per-engine HWSP and allows each context to operate independently. We do not need to unwind the per-context timeline, and so requests are always consistent with the timeline breadcrumb, greatly simplifying the completion checks as we no longer need to be concerned about the global_seqno changing mid check. At this point, we are emitting both per-context and global seqno and still using the single per-engine execution timeline for resolving interrupts. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/i915_request.c | 2 +- drivers/gpu/drm/i915/i915_request.h | 27 ++---- drivers/gpu/drm/i915/i915_reset.c | 1 + drivers/gpu/drm/i915/i915_vma.h | 7 ++ drivers/gpu/drm/i915/intel_lrc.c | 32 ++++--- drivers/gpu/drm/i915/intel_ringbuffer.c | 91 ++++++++++++++------ drivers/gpu/drm/i915/selftests/mock_engine.c | 8 +- 8 files changed, 109 insertions(+), 61 deletions(-)