diff mbox

[2/6] drm/i915: Retire requests along rings

Message ID 20180424131415.15102-2-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson April 24, 2018, 1:14 p.m. UTC
In the next patch, rings are the central timeline as requests may jump
between engines. Therefore in the future as we retire in order along the
engine timeline, we may retire out-of-order within a ring (as the ring now
occurs along multiple engines), leading to much hilarity in miscomputing
the position of ring->head.

As an added bonus, retiring along the ring reduces the penalty of having
one execlists client do cleanup for another (old legacy submission
shares a ring between all clients). The downside is that slow and
irregular (off the critical path) process of cleaning up stale requests
after userspace becomes a modicum less efficient.

In the long run, it will become apparent that the ordered
ring->request_list matches the ring->timeline, a fun challenge for the
future will be unifying the two lists to avoid duplication!

v2: We need both engine-order and ring-order processing to maintain our
knowledge of where individual rings have completed upto as well as
knowing what was last executing on any engine. And finally by decoupling
retiring the contexts on the engine and the timelines along the rings,
we do have to keep a reference to the context on each request
(previously it was guaranteed by the context being pinned).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h               |   3 +-
 drivers/gpu/drm/i915/i915_gem.c               |   1 +
 drivers/gpu/drm/i915/i915_request.c           | 140 +++++++++++-------
 drivers/gpu/drm/i915/i915_utils.h             |   6 +
 drivers/gpu/drm/i915/intel_ringbuffer.c       |   6 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h       |   1 +
 drivers/gpu/drm/i915/selftests/mock_engine.c  |  27 +++-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   2 +
 8 files changed, 120 insertions(+), 66 deletions(-)

Comments

Chris Wilson April 24, 2018, 1:19 p.m. UTC | #1
Quoting Chris Wilson (2018-04-24 14:14:11)
> In the next patch, rings are the central timeline as requests may jump
> between engines. Therefore in the future as we retire in order along the
> engine timeline, we may retire out-of-order within a ring (as the ring now
> occurs along multiple engines), leading to much hilarity in miscomputing
> the position of ring->head.
> 
> As an added bonus, retiring along the ring reduces the penalty of having
> one execlists client do cleanup for another (old legacy submission
> shares a ring between all clients). The downside is that slow and
> irregular (off the critical path) process of cleaning up stale requests
> after userspace becomes a modicum less efficient.
> 
> In the long run, it will become apparent that the ordered
> ring->request_list matches the ring->timeline, a fun challenge for the
> future will be unifying the two lists to avoid duplication!
> 
> v2: We need both engine-order and ring-order processing to maintain our
> knowledge of where individual rings have completed upto as well as
> knowing what was last executing on any engine. And finally by decoupling
> retiring the contexts on the engine and the timelines along the rings,
> we do have to keep a reference to the context on each request
> (previously it was guaranteed by the context being pinned).
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

As might have been expected, this is the one that gen8 takes a dislike
to. The symptom being a failed context switch leading to an
(eventual) unrecoverable GPU hang.

Afaict, I have all the asserts in place to ensure that we are tracking
the previous context correctly and only unpinning it after we guarantee
it is no longer in use by the GPU. Ideas? Can you see my mistake? :(

> ---
>  drivers/gpu/drm/i915/i915_drv.h               |   3 +-
>  drivers/gpu/drm/i915/i915_gem.c               |   1 +
>  drivers/gpu/drm/i915/i915_request.c           | 140 +++++++++++-------
>  drivers/gpu/drm/i915/i915_utils.h             |   6 +
>  drivers/gpu/drm/i915/intel_ringbuffer.c       |   6 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.h       |   1 +
>  drivers/gpu/drm/i915/selftests/mock_engine.c  |  27 +++-
>  .../gpu/drm/i915/selftests/mock_gem_device.c  |   2 +
>  8 files changed, 120 insertions(+), 66 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8fd9fb6efba5..1837c01d44d0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2058,8 +2058,9 @@ struct drm_i915_private {
>                 void (*resume)(struct drm_i915_private *);
>                 void (*cleanup_engine)(struct intel_engine_cs *engine);
>  
> -               struct list_head timelines;
>                 struct i915_gem_timeline global_timeline;
> +               struct list_head timelines;
> +               struct list_head rings;
>                 u32 active_requests;
>                 u32 request_serial;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 795ca83aed7a..906e2395c245 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5600,6 +5600,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>                 goto err_dependencies;
>  
>         mutex_lock(&dev_priv->drm.struct_mutex);
> +       INIT_LIST_HEAD(&dev_priv->gt.rings);
>         INIT_LIST_HEAD(&dev_priv->gt.timelines);
>         err = i915_gem_timeline_init__global(dev_priv);
>         mutex_unlock(&dev_priv->drm.struct_mutex);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index b1993d4a1a53..aa84b5447fd5 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -286,6 +286,7 @@ static int reserve_gt(struct drm_i915_private *i915)
>  
>  static void unreserve_gt(struct drm_i915_private *i915)
>  {
> +       GEM_BUG_ON(!i915->gt.active_requests);
>         if (!--i915->gt.active_requests)
>                 i915_gem_park(i915);
>  }
> @@ -298,6 +299,7 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
>  
>  static void advance_ring(struct i915_request *request)
>  {
> +       struct intel_ring *ring = request->ring;
>         unsigned int tail;
>  
>         /*
> @@ -309,7 +311,8 @@ static void advance_ring(struct i915_request *request)
>          * Note this requires that we are always called in request
>          * completion order.
>          */
> -       if (list_is_last(&request->ring_link, &request->ring->request_list)) {
> +       GEM_BUG_ON(!list_is_first(&request->ring_link, &ring->request_list));
> +       if (list_is_last(&request->ring_link, &ring->request_list)) {
>                 /*
>                  * We may race here with execlists resubmitting this request
>                  * as we retire it. The resubmission will move the ring->tail
> @@ -322,9 +325,9 @@ static void advance_ring(struct i915_request *request)
>         } else {
>                 tail = request->postfix;
>         }
> -       list_del(&request->ring_link);
> +       list_del_init(&request->ring_link);
>  
> -       request->ring->head = tail;
> +       ring->head = tail;
>  }
>  
>  static void free_capture_list(struct i915_request *request)
> @@ -340,30 +343,79 @@ static void free_capture_list(struct i915_request *request)
>         }
>  }
>  
> +static void __retire_engine_request(struct intel_engine_cs *engine,
> +                                   struct i915_request *request)
> +{
> +       GEM_BUG_ON(!i915_request_completed(request));
> +
> +       local_irq_disable();
> +
> +       spin_lock(&engine->timeline->lock);
> +       GEM_BUG_ON(!list_is_first(&request->link, &engine->timeline->requests));
> +       list_del_init(&request->link);
> +       spin_unlock(&engine->timeline->lock);
> +
> +       spin_lock(&request->lock);
> +       if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
> +               dma_fence_signal_locked(&request->fence);
> +       if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
> +               intel_engine_cancel_signaling(request);
> +       if (request->waitboost) {
> +               GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
> +               atomic_dec(&request->i915->gt_pm.rps.num_waiters);
> +       }
> +       spin_unlock(&request->lock);
> +
> +       local_irq_enable();
> +
> +       /*
> +        * The backing object for the context is done after switching to the
> +        * *next* context. Therefore we cannot retire the previous context until
> +        * the next context has already started running. However, since we
> +        * cannot take the required locks at i915_request_submit() we
> +        * defer the unpinning of the active context to now, retirement of
> +        * the subsequent request.
> +        */
> +       if (engine->last_retired_context)
> +               engine->context_unpin(engine, engine->last_retired_context);
> +       engine->last_retired_context = request->ctx;
> +}
> +
> +static void __retire_engine_upto(struct intel_engine_cs *engine,
> +                                struct i915_request *rq)
> +{
> +       struct i915_request *tmp;
> +
> +       GEM_BUG_ON(!i915_request_completed(rq));
> +       if (list_empty(&rq->link))
> +               return;
> +
> +       do {
> +               tmp = list_first_entry(&engine->timeline->requests,
> +                                      typeof(*tmp), link);
> +
> +               GEM_BUG_ON(tmp->engine != engine);
> +               __retire_engine_request(engine, tmp);
> +       } while (tmp != rq);
> +}
> +
>  static void i915_request_retire(struct i915_request *request)
>  {
> -       struct intel_engine_cs *engine = request->engine;
>         struct i915_gem_active *active, *next;
>  
>         GEM_TRACE("%s fence %llx:%d, global=%d, current %d\n",
> -                 engine->name,
> +                 request->engine->name,
>                   request->fence.context, request->fence.seqno,
>                   request->global_seqno,
> -                 intel_engine_get_seqno(engine));
> +                 intel_engine_get_seqno(request->engine));
>  
>         lockdep_assert_held(&request->i915->drm.struct_mutex);
>         GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
>         GEM_BUG_ON(!i915_request_completed(request));
> -       GEM_BUG_ON(!request->i915->gt.active_requests);
>  
>         trace_i915_request_retire(request);
>  
> -       spin_lock_irq(&engine->timeline->lock);
> -       list_del_init(&request->link);
> -       spin_unlock_irq(&engine->timeline->lock);
> -
>         advance_ring(request);
> -
>         free_capture_list(request);
>  
>         /*
> @@ -400,49 +452,30 @@ static void i915_request_retire(struct i915_request *request)
>         /* Retirement decays the ban score as it is a sign of ctx progress */
>         atomic_dec_if_positive(&request->ctx->ban_score);
>  
> -       /*
> -        * The backing object for the context is done after switching to the
> -        * *next* context. Therefore we cannot retire the previous context until
> -        * the next context has already started running. However, since we
> -        * cannot take the required locks at i915_request_submit() we
> -        * defer the unpinning of the active context to now, retirement of
> -        * the subsequent request.
> -        */
> -       if (engine->last_retired_context)
> -               engine->context_unpin(engine, engine->last_retired_context);
> -       engine->last_retired_context = request->ctx;
> -
> -       spin_lock_irq(&request->lock);
> -       if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
> -               dma_fence_signal_locked(&request->fence);
> -       if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
> -               intel_engine_cancel_signaling(request);
> -       if (request->waitboost) {
> -               GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
> -               atomic_dec(&request->i915->gt_pm.rps.num_waiters);
> -       }
> -       spin_unlock_irq(&request->lock);
> +       __retire_engine_upto(request->engine, request);
>  
>         unreserve_gt(request->i915);
>  
>         i915_sched_node_fini(request->i915, &request->sched);
> +
> +       i915_gem_context_put(request->ctx);
>         i915_request_put(request);
>  }
>  
>  void i915_request_retire_upto(struct i915_request *rq)
>  {
> -       struct intel_engine_cs *engine = rq->engine;
> +       struct intel_ring *ring = rq->ring;
>         struct i915_request *tmp;
>  
>         lockdep_assert_held(&rq->i915->drm.struct_mutex);
>         GEM_BUG_ON(!i915_request_completed(rq));
>  
> -       if (list_empty(&rq->link))
> +       if (list_empty(&rq->ring_link))
>                 return;
>  
>         do {
> -               tmp = list_first_entry(&engine->timeline->requests,
> -                                      typeof(*tmp), link);
> +               tmp = list_first_entry(&ring->request_list,
> +                                      typeof(*tmp), ring_link);
>  
>                 i915_request_retire(tmp);
>         } while (tmp != rq);
> @@ -651,9 +684,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>         if (ret)
>                 goto err_unreserve;
>  
> -       /* Move the oldest request to the slab-cache (if not in use!) */
> -       rq = list_first_entry_or_null(&engine->timeline->requests,
> -                                     typeof(*rq), link);
> +       /* Move our oldest request to the slab-cache (if not in use!) */
> +       rq = list_first_entry_or_null(&ring->request_list,
> +                                     typeof(*rq), ring_link);
>         if (rq && i915_request_completed(rq))
>                 i915_request_retire(rq);
>  
> @@ -733,7 +766,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>         INIT_LIST_HEAD(&rq->active_list);
>         rq->i915 = i915;
>         rq->engine = engine;
> -       rq->ctx = ctx;
> +       rq->ctx = i915_gem_context_get(ctx);
>         rq->ring = ring;
>  
>         /* No zalloc, must clear what we need by hand */
> @@ -777,6 +810,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>  
>  err_unwind:
>         rq->ring->emit = rq->head;
> +       i915_gem_context_put(ctx);
>  
>         /* Make sure we didn't add ourselves to external state before freeing */
>         GEM_BUG_ON(!list_empty(&rq->active_list));
> @@ -1357,38 +1391,30 @@ long i915_request_wait(struct i915_request *rq,
>         return timeout;
>  }
>  
> -static void engine_retire_requests(struct intel_engine_cs *engine)
> +static void ring_retire_requests(struct intel_ring *ring)
>  {
>         struct i915_request *request, *next;
> -       u32 seqno = intel_engine_get_seqno(engine);
> -       LIST_HEAD(retire);
>  
> -       spin_lock_irq(&engine->timeline->lock);
>         list_for_each_entry_safe(request, next,
> -                                &engine->timeline->requests, link) {
> -               if (!i915_seqno_passed(seqno, request->global_seqno))
> +                                &ring->request_list, ring_link) {
> +               if (!i915_request_completed(request))
>                         break;
>  
> -               list_move_tail(&request->link, &retire);
> -       }
> -       spin_unlock_irq(&engine->timeline->lock);
> -
> -       list_for_each_entry_safe(request, next, &retire, link)
>                 i915_request_retire(request);
> +       }
>  }
>  
>  void i915_retire_requests(struct drm_i915_private *i915)
>  {
> -       struct intel_engine_cs *engine;
> -       enum intel_engine_id id;
> +       struct intel_ring *ring, *next;
>  
>         lockdep_assert_held(&i915->drm.struct_mutex);
>  
>         if (!i915->gt.active_requests)
>                 return;
>  
> -       for_each_engine(engine, i915, id)
> -               engine_retire_requests(engine);
> +       list_for_each_entry_safe(ring, next, &i915->gt.rings, link)
> +               ring_retire_requests(ring);
>  }
>  
>  #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index 0695717522ea..00165ad55fb3 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -120,6 +120,12 @@ static inline u64 ptr_to_u64(const void *ptr)
>  
>  #include <linux/list.h>
>  
> +static inline int list_is_first(const struct list_head *list,
> +                               const struct list_head *head)
> +{
> +       return head->next == list;
> +}
> +
>  static inline void __list_del_many(struct list_head *head,
>                                    struct list_head *first)
>  {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index c68ac605b8a9..75dca28782ee 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1065,7 +1065,6 @@ int intel_ring_pin(struct intel_ring *ring,
>  
>  void intel_ring_reset(struct intel_ring *ring, u32 tail)
>  {
> -       GEM_BUG_ON(!list_empty(&ring->request_list));
>         ring->tail = tail;
>         ring->head = tail;
>         ring->emit = tail;
> @@ -1124,6 +1123,7 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size)
>  
>         GEM_BUG_ON(!is_power_of_2(size));
>         GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
> +       lockdep_assert_held(&engine->i915->drm.struct_mutex);
>  
>         ring = kzalloc(sizeof(*ring), GFP_KERNEL);
>         if (!ring)
> @@ -1149,6 +1149,8 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size)
>         }
>         ring->vma = vma;
>  
> +       list_add(&ring->link, &engine->i915->gt.rings);
> +
>         return ring;
>  }
>  
> @@ -1160,6 +1162,8 @@ intel_ring_free(struct intel_ring *ring)
>         i915_vma_close(ring->vma);
>         __i915_gem_object_release_unless_active(obj);
>  
> +       list_del(&ring->link);
> +
>         kfree(ring);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index c5e27905b0e1..d816f8dea245 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -129,6 +129,7 @@ struct intel_ring {
>         void *vaddr;
>  
>         struct list_head request_list;
> +       struct list_head link;
>  
>         u32 head;
>         u32 tail;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index 78a89efa1119..a0bc324edadd 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -140,9 +140,18 @@ static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
>         INIT_LIST_HEAD(&ring->request_list);
>         intel_ring_update_space(ring);
>  
> +       list_add(&ring->link, &engine->i915->gt.rings);
> +
>         return ring;
>  }
>  
> +static void mock_ring_free(struct intel_ring *ring)
> +{
> +       list_del(&ring->link);
> +
> +       kfree(ring);
> +}
> +
>  struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>                                     const char *name,
>                                     int id)
> @@ -155,12 +164,6 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>         if (!engine)
>                 return NULL;
>  
> -       engine->base.buffer = mock_ring(&engine->base);
> -       if (!engine->base.buffer) {
> -               kfree(engine);
> -               return NULL;
> -       }
> -
>         /* minimal engine setup for requests */
>         engine->base.i915 = i915;
>         snprintf(engine->base.name, sizeof(engine->base.name), "%s", name);
> @@ -185,7 +188,16 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>         timer_setup(&engine->hw_delay, hw_delay_complete, 0);
>         INIT_LIST_HEAD(&engine->hw_queue);
>  
> +       engine->base.buffer = mock_ring(&engine->base);
> +       if (!engine->base.buffer)
> +               goto err_breadcrumbs;
> +
>         return &engine->base;
> +
> +err_breadcrumbs:
> +       intel_engine_fini_breadcrumbs(&engine->base);
> +       kfree(engine);
> +       return NULL;
>  }
>  
>  void mock_engine_flush(struct intel_engine_cs *engine)
> @@ -219,8 +231,9 @@ void mock_engine_free(struct intel_engine_cs *engine)
>         if (engine->last_retired_context)
>                 engine->context_unpin(engine, engine->last_retired_context);
>  
> +       mock_ring_free(engine->buffer);
> +
>         intel_engine_fini_breadcrumbs(engine);
>  
> -       kfree(engine->buffer);
>         kfree(engine);
>  }
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index e6d4b882599a..ac4bacf8b5b9 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -44,6 +44,7 @@ void mock_device_flush(struct drm_i915_private *i915)
>                 mock_engine_flush(engine);
>  
>         i915_retire_requests(i915);
> +       GEM_BUG_ON(i915->gt.active_requests);
>  }
>  
>  static void mock_device_release(struct drm_device *dev)
> @@ -224,6 +225,7 @@ struct drm_i915_private *mock_gem_device(void)
>                 goto err_dependencies;
>  
>         mutex_lock(&i915->drm.struct_mutex);
> +       INIT_LIST_HEAD(&i915->gt.rings);
>         INIT_LIST_HEAD(&i915->gt.timelines);
>         err = i915_gem_timeline_init__global(i915);
>         if (err) {
> -- 
> 2.17.0
>
Tvrtko Ursulin April 24, 2018, 2:46 p.m. UTC | #2
On 24/04/2018 14:14, Chris Wilson wrote:
> In the next patch, rings are the central timeline as requests may jump
> between engines. Therefore in the future as we retire in order along the
> engine timeline, we may retire out-of-order within a ring (as the ring now
> occurs along multiple engines), leading to much hilarity in miscomputing
> the position of ring->head.
> 
> As an added bonus, retiring along the ring reduces the penalty of having
> one execlists client do cleanup for another (old legacy submission
> shares a ring between all clients). The downside is that slow and
> irregular (off the critical path) process of cleaning up stale requests
> after userspace becomes a modicum less efficient.
> 
> In the long run, it will become apparent that the ordered
> ring->request_list matches the ring->timeline, a fun challenge for the
> future will be unifying the two lists to avoid duplication!
> 
> v2: We need both engine-order and ring-order processing to maintain our
> knowledge of where individual rings have completed upto as well as
> knowing what was last executing on any engine. And finally by decoupling
> retiring the contexts on the engine and the timelines along the rings,
> we do have to keep a reference to the context on each request
> (previously it was guaranteed by the context being pinned).
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h               |   3 +-
>   drivers/gpu/drm/i915/i915_gem.c               |   1 +
>   drivers/gpu/drm/i915/i915_request.c           | 140 +++++++++++-------
>   drivers/gpu/drm/i915/i915_utils.h             |   6 +
>   drivers/gpu/drm/i915/intel_ringbuffer.c       |   6 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h       |   1 +
>   drivers/gpu/drm/i915/selftests/mock_engine.c  |  27 +++-
>   .../gpu/drm/i915/selftests/mock_gem_device.c  |   2 +
>   8 files changed, 120 insertions(+), 66 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8fd9fb6efba5..1837c01d44d0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2058,8 +2058,9 @@ struct drm_i915_private {
>   		void (*resume)(struct drm_i915_private *);
>   		void (*cleanup_engine)(struct intel_engine_cs *engine);
>   
> -		struct list_head timelines;
>   		struct i915_gem_timeline global_timeline;
> +		struct list_head timelines;
> +		struct list_head rings;
>   		u32 active_requests;
>   		u32 request_serial;
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 795ca83aed7a..906e2395c245 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5600,6 +5600,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>   		goto err_dependencies;
>   
>   	mutex_lock(&dev_priv->drm.struct_mutex);
> +	INIT_LIST_HEAD(&dev_priv->gt.rings);
>   	INIT_LIST_HEAD(&dev_priv->gt.timelines);
>   	err = i915_gem_timeline_init__global(dev_priv);
>   	mutex_unlock(&dev_priv->drm.struct_mutex);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index b1993d4a1a53..aa84b5447fd5 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -286,6 +286,7 @@ static int reserve_gt(struct drm_i915_private *i915)
>   
>   static void unreserve_gt(struct drm_i915_private *i915)
>   {
> +	GEM_BUG_ON(!i915->gt.active_requests);
>   	if (!--i915->gt.active_requests)
>   		i915_gem_park(i915);
>   }
> @@ -298,6 +299,7 @@ void i915_gem_retire_noop(struct i915_gem_active *active,
>   
>   static void advance_ring(struct i915_request *request)
>   {
> +	struct intel_ring *ring = request->ring;
>   	unsigned int tail;
>   
>   	/*
> @@ -309,7 +311,8 @@ static void advance_ring(struct i915_request *request)
>   	 * Note this requires that we are always called in request
>   	 * completion order.
>   	 */
> -	if (list_is_last(&request->ring_link, &request->ring->request_list)) {
> +	GEM_BUG_ON(!list_is_first(&request->ring_link, &ring->request_list));
> +	if (list_is_last(&request->ring_link, &ring->request_list)) {
>   		/*
>   		 * We may race here with execlists resubmitting this request
>   		 * as we retire it. The resubmission will move the ring->tail
> @@ -322,9 +325,9 @@ static void advance_ring(struct i915_request *request)
>   	} else {
>   		tail = request->postfix;
>   	}
> -	list_del(&request->ring_link);
> +	list_del_init(&request->ring_link);

Why the _init flavour? There are two list heads for being on two 
separate lists, but are either path reachable after being removed from 
the respective lists? (I may find the answer as I read on.)

>   
> -	request->ring->head = tail;
> +	ring->head = tail;
>   }
>   
>   static void free_capture_list(struct i915_request *request)
> @@ -340,30 +343,79 @@ static void free_capture_list(struct i915_request *request)
>   	}
>   }
>   
> +static void __retire_engine_request(struct intel_engine_cs *engine,
> +				    struct i915_request *request)
> +{
> +	GEM_BUG_ON(!i915_request_completed(request));
> +
> +	local_irq_disable();
> +
> +	spin_lock(&engine->timeline->lock);
> +	GEM_BUG_ON(!list_is_first(&request->link, &engine->timeline->requests));
> +	list_del_init(&request->link);
> +	spin_unlock(&engine->timeline->lock);
> +
> +	spin_lock(&request->lock);
> +	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
> +		dma_fence_signal_locked(&request->fence);
> +	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
> +		intel_engine_cancel_signaling(request);
> +	if (request->waitboost) {
> +		GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
> +		atomic_dec(&request->i915->gt_pm.rps.num_waiters);
> +	}
> +	spin_unlock(&request->lock);
> +
> +	local_irq_enable();
> +
> +	/*
> +	 * The backing object for the context is done after switching to the
> +	 * *next* context. Therefore we cannot retire the previous context until
> +	 * the next context has already started running. However, since we
> +	 * cannot take the required locks at i915_request_submit() we
> +	 * defer the unpinning of the active context to now, retirement of
> +	 * the subsequent request.
> +	 */
> +	if (engine->last_retired_context)
> +		engine->context_unpin(engine, engine->last_retired_context);
> +	engine->last_retired_context = request->ctx;
> +}
> +
> +static void __retire_engine_upto(struct intel_engine_cs *engine,
> +				 struct i915_request *rq)
> +{
> +	struct i915_request *tmp;
> +
> +	GEM_BUG_ON(!i915_request_completed(rq));
> +	if (list_empty(&rq->link))
> +		return;
> +
> +	do {
> +		tmp = list_first_entry(&engine->timeline->requests,
> +				       typeof(*tmp), link);
> +
> +		GEM_BUG_ON(tmp->engine != engine);
> +		__retire_engine_request(engine, tmp);
> +	} while (tmp != rq);
> +}
> +
>   static void i915_request_retire(struct i915_request *request)
>   {
> -	struct intel_engine_cs *engine = request->engine;
>   	struct i915_gem_active *active, *next;
>   
>   	GEM_TRACE("%s fence %llx:%d, global=%d, current %d\n",
> -		  engine->name,
> +		  request->engine->name,
>   		  request->fence.context, request->fence.seqno,
>   		  request->global_seqno,
> -		  intel_engine_get_seqno(engine));
> +		  intel_engine_get_seqno(request->engine));
>   
>   	lockdep_assert_held(&request->i915->drm.struct_mutex);
>   	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
>   	GEM_BUG_ON(!i915_request_completed(request));
> -	GEM_BUG_ON(!request->i915->gt.active_requests);
>   
>   	trace_i915_request_retire(request);
>   
> -	spin_lock_irq(&engine->timeline->lock);
> -	list_del_init(&request->link);
> -	spin_unlock_irq(&engine->timeline->lock);
> -
>   	advance_ring(request);
> -
>   	free_capture_list(request);
>   
>   	/*
> @@ -400,49 +452,30 @@ static void i915_request_retire(struct i915_request *request)
>   	/* Retirement decays the ban score as it is a sign of ctx progress */
>   	atomic_dec_if_positive(&request->ctx->ban_score);
>   
> -	/*
> -	 * The backing object for the context is done after switching to the
> -	 * *next* context. Therefore we cannot retire the previous context until
> -	 * the next context has already started running. However, since we
> -	 * cannot take the required locks at i915_request_submit() we
> -	 * defer the unpinning of the active context to now, retirement of
> -	 * the subsequent request.
> -	 */
> -	if (engine->last_retired_context)
> -		engine->context_unpin(engine, engine->last_retired_context);
> -	engine->last_retired_context = request->ctx;
> -
> -	spin_lock_irq(&request->lock);
> -	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
> -		dma_fence_signal_locked(&request->fence);
> -	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
> -		intel_engine_cancel_signaling(request);
> -	if (request->waitboost) {
> -		GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
> -		atomic_dec(&request->i915->gt_pm.rps.num_waiters);
> -	}
> -	spin_unlock_irq(&request->lock);
> +	__retire_engine_upto(request->engine, request);
>   

I did not figure out why do you need to do this. Normally retirement 
driven from ring timelines would retire requests on the same engine 
belonging to a different ring, as it walks over ring timelines.

Only for direct callers of i915_request_retire it may make a difference 
but I don't understand is it an functional difference or just optimisation.

This is then from where list_del_init comes from, since this function 
can retire wider than the caller would expect. But then it retires on 
the engine (upto) and the callers just walks the list and calls retire 
only to find already unlinked elements. Could it just as well then 
retire it completely?

>   	unreserve_gt(request->i915);
>   
>   	i915_sched_node_fini(request->i915, &request->sched);
> +
> +	i915_gem_context_put(request->ctx);
>   	i915_request_put(request);
>   }
>   
>   void i915_request_retire_upto(struct i915_request *rq)
>   {
> -	struct intel_engine_cs *engine = rq->engine;
> +	struct intel_ring *ring = rq->ring;
>   	struct i915_request *tmp;
>   
>   	lockdep_assert_held(&rq->i915->drm.struct_mutex);
>   	GEM_BUG_ON(!i915_request_completed(rq));
>   
> -	if (list_empty(&rq->link))
> +	if (list_empty(&rq->ring_link))
>   		return;
>   
>   	do {
> -		tmp = list_first_entry(&engine->timeline->requests,
> -				       typeof(*tmp), link);
> +		tmp = list_first_entry(&ring->request_list,
> +				       typeof(*tmp), ring_link);
>   
>   		i915_request_retire(tmp);
>   	} while (tmp != rq);
> @@ -651,9 +684,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	if (ret)
>   		goto err_unreserve;
>   
> -	/* Move the oldest request to the slab-cache (if not in use!) */
> -	rq = list_first_entry_or_null(&engine->timeline->requests,
> -				      typeof(*rq), link);
> +	/* Move our oldest request to the slab-cache (if not in use!) */
> +	rq = list_first_entry_or_null(&ring->request_list,
> +				      typeof(*rq), ring_link);

This is less efficient now - I wonder if you should still be looking at 
the engine timeline here?

>   	if (rq && i915_request_completed(rq))
>   		i915_request_retire(rq);
>   
> @@ -733,7 +766,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	INIT_LIST_HEAD(&rq->active_list);
>   	rq->i915 = i915;
>   	rq->engine = engine;
> -	rq->ctx = ctx;
> +	rq->ctx = i915_gem_context_get(ctx);

Putting in a short comment saying why the reference is needed would be good.

>   	rq->ring = ring;
>   
>   	/* No zalloc, must clear what we need by hand */
> @@ -777,6 +810,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   
>   err_unwind:
>   	rq->ring->emit = rq->head;
> +	i915_gem_context_put(ctx);
>   
>   	/* Make sure we didn't add ourselves to external state before freeing */
>   	GEM_BUG_ON(!list_empty(&rq->active_list));
> @@ -1357,38 +1391,30 @@ long i915_request_wait(struct i915_request *rq,
>   	return timeout;
>   }
>   
> -static void engine_retire_requests(struct intel_engine_cs *engine)
> +static void ring_retire_requests(struct intel_ring *ring)
>   {
>   	struct i915_request *request, *next;
> -	u32 seqno = intel_engine_get_seqno(engine);
> -	LIST_HEAD(retire);
>   
> -	spin_lock_irq(&engine->timeline->lock);
>   	list_for_each_entry_safe(request, next,
> -				 &engine->timeline->requests, link) {
> -		if (!i915_seqno_passed(seqno, request->global_seqno))
> +				 &ring->request_list, ring_link) {
> +		if (!i915_request_completed(request))
>   			break;
>   
> -		list_move_tail(&request->link, &retire);
> -	}
> -	spin_unlock_irq(&engine->timeline->lock);
> -
> -	list_for_each_entry_safe(request, next, &retire, link)
>   		i915_request_retire(request);
> +	}
>   }
>   
>   void i915_retire_requests(struct drm_i915_private *i915)
>   {
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> +	struct intel_ring *ring, *next;
>   
>   	lockdep_assert_held(&i915->drm.struct_mutex);
>   
>   	if (!i915->gt.active_requests)
>   		return;
>   
> -	for_each_engine(engine, i915, id)
> -		engine_retire_requests(engine);
> +	list_for_each_entry_safe(ring, next, &i915->gt.rings, link)
> +		ring_retire_requests(ring);
>   }
>   
>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> index 0695717522ea..00165ad55fb3 100644
> --- a/drivers/gpu/drm/i915/i915_utils.h
> +++ b/drivers/gpu/drm/i915/i915_utils.h
> @@ -120,6 +120,12 @@ static inline u64 ptr_to_u64(const void *ptr)
>   
>   #include <linux/list.h>
>   
> +static inline int list_is_first(const struct list_head *list,
> +				const struct list_head *head)
> +{
> +	return head->next == list;
> +}
> +
>   static inline void __list_del_many(struct list_head *head,
>   				   struct list_head *first)
>   {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index c68ac605b8a9..75dca28782ee 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1065,7 +1065,6 @@ int intel_ring_pin(struct intel_ring *ring,
>   
>   void intel_ring_reset(struct intel_ring *ring, u32 tail)
>   {
> -	GEM_BUG_ON(!list_empty(&ring->request_list));
>   	ring->tail = tail;
>   	ring->head = tail;
>   	ring->emit = tail;
> @@ -1124,6 +1123,7 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size)
>   
>   	GEM_BUG_ON(!is_power_of_2(size));
>   	GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
> +	lockdep_assert_held(&engine->i915->drm.struct_mutex);
>   
>   	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
>   	if (!ring)
> @@ -1149,6 +1149,8 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size)
>   	}
>   	ring->vma = vma;
>   
> +	list_add(&ring->link, &engine->i915->gt.rings);
> +
>   	return ring;
>   }
>   
> @@ -1160,6 +1162,8 @@ intel_ring_free(struct intel_ring *ring)
>   	i915_vma_close(ring->vma);
>   	__i915_gem_object_release_unless_active(obj);
>   
> +	list_del(&ring->link);
> +
>   	kfree(ring);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index c5e27905b0e1..d816f8dea245 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -129,6 +129,7 @@ struct intel_ring {
>   	void *vaddr;
>   
>   	struct list_head request_list;
> +	struct list_head link;
>   
>   	u32 head;
>   	u32 tail;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index 78a89efa1119..a0bc324edadd 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -140,9 +140,18 @@ static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
>   	INIT_LIST_HEAD(&ring->request_list);
>   	intel_ring_update_space(ring);
>   
> +	list_add(&ring->link, &engine->i915->gt.rings);
> +
>   	return ring;
>   }
>   
> +static void mock_ring_free(struct intel_ring *ring)
> +{
> +	list_del(&ring->link);
> +
> +	kfree(ring);
> +}
> +
>   struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   				    const char *name,
>   				    int id)
> @@ -155,12 +164,6 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   	if (!engine)
>   		return NULL;
>   
> -	engine->base.buffer = mock_ring(&engine->base);
> -	if (!engine->base.buffer) {
> -		kfree(engine);
> -		return NULL;
> -	}
> -
>   	/* minimal engine setup for requests */
>   	engine->base.i915 = i915;
>   	snprintf(engine->base.name, sizeof(engine->base.name), "%s", name);
> @@ -185,7 +188,16 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   	timer_setup(&engine->hw_delay, hw_delay_complete, 0);
>   	INIT_LIST_HEAD(&engine->hw_queue);
>   
> +	engine->base.buffer = mock_ring(&engine->base);
> +	if (!engine->base.buffer)
> +		goto err_breadcrumbs;
> +
>   	return &engine->base;
> +
> +err_breadcrumbs:
> +	intel_engine_fini_breadcrumbs(&engine->base);
> +	kfree(engine);
> +	return NULL;
>   }
>   
>   void mock_engine_flush(struct intel_engine_cs *engine)
> @@ -219,8 +231,9 @@ void mock_engine_free(struct intel_engine_cs *engine)
>   	if (engine->last_retired_context)
>   		engine->context_unpin(engine, engine->last_retired_context);
>   
> +	mock_ring_free(engine->buffer);
> +
>   	intel_engine_fini_breadcrumbs(engine);
>   
> -	kfree(engine->buffer);
>   	kfree(engine);
>   }
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index e6d4b882599a..ac4bacf8b5b9 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -44,6 +44,7 @@ void mock_device_flush(struct drm_i915_private *i915)
>   		mock_engine_flush(engine);
>   
>   	i915_retire_requests(i915);
> +	GEM_BUG_ON(i915->gt.active_requests);
>   }
>   
>   static void mock_device_release(struct drm_device *dev)
> @@ -224,6 +225,7 @@ struct drm_i915_private *mock_gem_device(void)
>   		goto err_dependencies;
>   
>   	mutex_lock(&i915->drm.struct_mutex);
> +	INIT_LIST_HEAD(&i915->gt.rings);
>   	INIT_LIST_HEAD(&i915->gt.timelines);
>   	err = i915_gem_timeline_init__global(i915);
>   	if (err) {
> 

Sorry I did not spot the problem yet. Still in the phase of trying to 
understand the approach.

Regards,

Tvrtko
Tvrtko Ursulin April 24, 2018, 2:56 p.m. UTC | #3
On 24/04/2018 15:46, Tvrtko Ursulin wrote:
> 
> On 24/04/2018 14:14, Chris Wilson wrote:
>> In the next patch, rings are the central timeline as requests may jump
>> between engines. Therefore in the future as we retire in order along the
>> engine timeline, we may retire out-of-order within a ring (as the ring 
>> now
>> occurs along multiple engines), leading to much hilarity in miscomputing
>> the position of ring->head.
>>
>> As an added bonus, retiring along the ring reduces the penalty of having
>> one execlists client do cleanup for another (old legacy submission
>> shares a ring between all clients). The downside is that slow and
>> irregular (off the critical path) process of cleaning up stale requests
>> after userspace becomes a modicum less efficient.
>>
>> In the long run, it will become apparent that the ordered
>> ring->request_list matches the ring->timeline, a fun challenge for the
>> future will be unifying the two lists to avoid duplication!
>>
>> v2: We need both engine-order and ring-order processing to maintain our
>> knowledge of where individual rings have completed upto as well as
>> knowing what was last executing on any engine. And finally by decoupling
>> retiring the contexts on the engine and the timelines along the rings,
>> we do have to keep a reference to the context on each request
>> (previously it was guaranteed by the context being pinned).
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h               |   3 +-
>>   drivers/gpu/drm/i915/i915_gem.c               |   1 +
>>   drivers/gpu/drm/i915/i915_request.c           | 140 +++++++++++-------
>>   drivers/gpu/drm/i915/i915_utils.h             |   6 +
>>   drivers/gpu/drm/i915/intel_ringbuffer.c       |   6 +-
>>   drivers/gpu/drm/i915/intel_ringbuffer.h       |   1 +
>>   drivers/gpu/drm/i915/selftests/mock_engine.c  |  27 +++-
>>   .../gpu/drm/i915/selftests/mock_gem_device.c  |   2 +
>>   8 files changed, 120 insertions(+), 66 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h 
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 8fd9fb6efba5..1837c01d44d0 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2058,8 +2058,9 @@ struct drm_i915_private {
>>           void (*resume)(struct drm_i915_private *);
>>           void (*cleanup_engine)(struct intel_engine_cs *engine);
>> -        struct list_head timelines;
>>           struct i915_gem_timeline global_timeline;
>> +        struct list_head timelines;
>> +        struct list_head rings;
>>           u32 active_requests;
>>           u32 request_serial;
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c 
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 795ca83aed7a..906e2395c245 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -5600,6 +5600,7 @@ int i915_gem_init_early(struct drm_i915_private 
>> *dev_priv)
>>           goto err_dependencies;
>>       mutex_lock(&dev_priv->drm.struct_mutex);
>> +    INIT_LIST_HEAD(&dev_priv->gt.rings);
>>       INIT_LIST_HEAD(&dev_priv->gt.timelines);
>>       err = i915_gem_timeline_init__global(dev_priv);
>>       mutex_unlock(&dev_priv->drm.struct_mutex);
>> diff --git a/drivers/gpu/drm/i915/i915_request.c 
>> b/drivers/gpu/drm/i915/i915_request.c
>> index b1993d4a1a53..aa84b5447fd5 100644
>> --- a/drivers/gpu/drm/i915/i915_request.c
>> +++ b/drivers/gpu/drm/i915/i915_request.c
>> @@ -286,6 +286,7 @@ static int reserve_gt(struct drm_i915_private *i915)
>>   static void unreserve_gt(struct drm_i915_private *i915)
>>   {
>> +    GEM_BUG_ON(!i915->gt.active_requests);
>>       if (!--i915->gt.active_requests)
>>           i915_gem_park(i915);
>>   }
>> @@ -298,6 +299,7 @@ void i915_gem_retire_noop(struct i915_gem_active 
>> *active,
>>   static void advance_ring(struct i915_request *request)
>>   {
>> +    struct intel_ring *ring = request->ring;
>>       unsigned int tail;
>>       /*
>> @@ -309,7 +311,8 @@ static void advance_ring(struct i915_request 
>> *request)
>>        * Note this requires that we are always called in request
>>        * completion order.
>>        */
>> -    if (list_is_last(&request->ring_link, 
>> &request->ring->request_list)) {
>> +    GEM_BUG_ON(!list_is_first(&request->ring_link, 
>> &ring->request_list));
>> +    if (list_is_last(&request->ring_link, &ring->request_list)) {
>>           /*
>>            * We may race here with execlists resubmitting this request
>>            * as we retire it. The resubmission will move the ring->tail
>> @@ -322,9 +325,9 @@ static void advance_ring(struct i915_request 
>> *request)
>>       } else {
>>           tail = request->postfix;
>>       }
>> -    list_del(&request->ring_link);
>> +    list_del_init(&request->ring_link);
> 
> Why the _init flavour? There are two list heads for being on two 
> separate lists, but are either path reachable after being removed from 
> the respective lists? (I may find the answer as I read on.)
> 
>> -    request->ring->head = tail;
>> +    ring->head = tail;
>>   }
>>   static void free_capture_list(struct i915_request *request)
>> @@ -340,30 +343,79 @@ static void free_capture_list(struct 
>> i915_request *request)
>>       }
>>   }
>> +static void __retire_engine_request(struct intel_engine_cs *engine,
>> +                    struct i915_request *request)
>> +{
>> +    GEM_BUG_ON(!i915_request_completed(request));
>> +
>> +    local_irq_disable();
>> +
>> +    spin_lock(&engine->timeline->lock);
>> +    GEM_BUG_ON(!list_is_first(&request->link, 
>> &engine->timeline->requests));
>> +    list_del_init(&request->link);
>> +    spin_unlock(&engine->timeline->lock);
>> +
>> +    spin_lock(&request->lock);
>> +    if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
>> +        dma_fence_signal_locked(&request->fence);
>> +    if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 
>> &request->fence.flags))
>> +        intel_engine_cancel_signaling(request);
>> +    if (request->waitboost) {
>> +        GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
>> +        atomic_dec(&request->i915->gt_pm.rps.num_waiters);
>> +    }
>> +    spin_unlock(&request->lock);
>> +
>> +    local_irq_enable();
>> +
>> +    /*
>> +     * The backing object for the context is done after switching to the
>> +     * *next* context. Therefore we cannot retire the previous 
>> context until
>> +     * the next context has already started running. However, since we
>> +     * cannot take the required locks at i915_request_submit() we
>> +     * defer the unpinning of the active context to now, retirement of
>> +     * the subsequent request.
>> +     */
>> +    if (engine->last_retired_context)
>> +        engine->context_unpin(engine, engine->last_retired_context);
>> +    engine->last_retired_context = request->ctx;
>> +}
>> +
>> +static void __retire_engine_upto(struct intel_engine_cs *engine,
>> +                 struct i915_request *rq)
>> +{
>> +    struct i915_request *tmp;
>> +
>> +    GEM_BUG_ON(!i915_request_completed(rq));
>> +    if (list_empty(&rq->link))
>> +        return;
>> +
>> +    do {
>> +        tmp = list_first_entry(&engine->timeline->requests,
>> +                       typeof(*tmp), link);
>> +
>> +        GEM_BUG_ON(tmp->engine != engine);
>> +        __retire_engine_request(engine, tmp);
>> +    } while (tmp != rq);
>> +}
>> +
>>   static void i915_request_retire(struct i915_request *request)
>>   {
>> -    struct intel_engine_cs *engine = request->engine;
>>       struct i915_gem_active *active, *next;
>>       GEM_TRACE("%s fence %llx:%d, global=%d, current %d\n",
>> -          engine->name,
>> +          request->engine->name,
>>             request->fence.context, request->fence.seqno,
>>             request->global_seqno,
>> -          intel_engine_get_seqno(engine));
>> +          intel_engine_get_seqno(request->engine));
>>       lockdep_assert_held(&request->i915->drm.struct_mutex);
>>       GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
>>       GEM_BUG_ON(!i915_request_completed(request));
>> -    GEM_BUG_ON(!request->i915->gt.active_requests);
>>       trace_i915_request_retire(request);
>> -    spin_lock_irq(&engine->timeline->lock);
>> -    list_del_init(&request->link);
>> -    spin_unlock_irq(&engine->timeline->lock);
>> -
>>       advance_ring(request);
>> -
>>       free_capture_list(request);
>>       /*
>> @@ -400,49 +452,30 @@ static void i915_request_retire(struct 
>> i915_request *request)
>>       /* Retirement decays the ban score as it is a sign of ctx 
>> progress */
>>       atomic_dec_if_positive(&request->ctx->ban_score);
>> -    /*
>> -     * The backing object for the context is done after switching to the
>> -     * *next* context. Therefore we cannot retire the previous 
>> context until
>> -     * the next context has already started running. However, since we
>> -     * cannot take the required locks at i915_request_submit() we
>> -     * defer the unpinning of the active context to now, retirement of
>> -     * the subsequent request.
>> -     */
>> -    if (engine->last_retired_context)
>> -        engine->context_unpin(engine, engine->last_retired_context);
>> -    engine->last_retired_context = request->ctx;
>> -
>> -    spin_lock_irq(&request->lock);
>> -    if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
>> -        dma_fence_signal_locked(&request->fence);
>> -    if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 
>> &request->fence.flags))
>> -        intel_engine_cancel_signaling(request);
>> -    if (request->waitboost) {
>> -        GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
>> -        atomic_dec(&request->i915->gt_pm.rps.num_waiters);
>> -    }
>> -    spin_unlock_irq(&request->lock);
>> +    __retire_engine_upto(request->engine, request);
> 
> I did not figure out why do you need to do this. Normally retirement 
> driven from ring timelines would retire requests on the same engine 
> belonging to a different ring, as it walks over ring timelines.
> 
> Only for direct callers of i915_request_retire it may make a difference 
> but I don't understand is it an functional difference or just optimisation.

Context pinning of course needs to be in engine order!

Regards,

Tvrtko


> This is then from where list_del_init comes from, since this function 
> can retire wider than the caller would expect. But then it retires on 
> the engine (upto) and the callers just walks the list and calls retire 
> only to find already unlinked elements. Could it just as well then 
> retire it completely?
> 
>>       unreserve_gt(request->i915);
>>       i915_sched_node_fini(request->i915, &request->sched);
>> +
>> +    i915_gem_context_put(request->ctx);
>>       i915_request_put(request);
>>   }
>>   void i915_request_retire_upto(struct i915_request *rq)
>>   {
>> -    struct intel_engine_cs *engine = rq->engine;
>> +    struct intel_ring *ring = rq->ring;
>>       struct i915_request *tmp;
>>       lockdep_assert_held(&rq->i915->drm.struct_mutex);
>>       GEM_BUG_ON(!i915_request_completed(rq));
>> -    if (list_empty(&rq->link))
>> +    if (list_empty(&rq->ring_link))
>>           return;
>>       do {
>> -        tmp = list_first_entry(&engine->timeline->requests,
>> -                       typeof(*tmp), link);
>> +        tmp = list_first_entry(&ring->request_list,
>> +                       typeof(*tmp), ring_link);
>>           i915_request_retire(tmp);
>>       } while (tmp != rq);
>> @@ -651,9 +684,9 @@ i915_request_alloc(struct intel_engine_cs *engine, 
>> struct i915_gem_context *ctx)
>>       if (ret)
>>           goto err_unreserve;
>> -    /* Move the oldest request to the slab-cache (if not in use!) */
>> -    rq = list_first_entry_or_null(&engine->timeline->requests,
>> -                      typeof(*rq), link);
>> +    /* Move our oldest request to the slab-cache (if not in use!) */
>> +    rq = list_first_entry_or_null(&ring->request_list,
>> +                      typeof(*rq), ring_link);
> 
> This is less efficient now - I wonder if you should still be looking at 
> the engine timeline here?
> 
>>       if (rq && i915_request_completed(rq))
>>           i915_request_retire(rq);
>> @@ -733,7 +766,7 @@ i915_request_alloc(struct intel_engine_cs *engine, 
>> struct i915_gem_context *ctx)
>>       INIT_LIST_HEAD(&rq->active_list);
>>       rq->i915 = i915;
>>       rq->engine = engine;
>> -    rq->ctx = ctx;
>> +    rq->ctx = i915_gem_context_get(ctx);
> 
> Putting in a short comment saying why the reference is needed would be 
> good.
> 
>>       rq->ring = ring;
>>       /* No zalloc, must clear what we need by hand */
>> @@ -777,6 +810,7 @@ i915_request_alloc(struct intel_engine_cs *engine, 
>> struct i915_gem_context *ctx)
>>   err_unwind:
>>       rq->ring->emit = rq->head;
>> +    i915_gem_context_put(ctx);
>>       /* Make sure we didn't add ourselves to external state before 
>> freeing */
>>       GEM_BUG_ON(!list_empty(&rq->active_list));
>> @@ -1357,38 +1391,30 @@ long i915_request_wait(struct i915_request *rq,
>>       return timeout;
>>   }
>> -static void engine_retire_requests(struct intel_engine_cs *engine)
>> +static void ring_retire_requests(struct intel_ring *ring)
>>   {
>>       struct i915_request *request, *next;
>> -    u32 seqno = intel_engine_get_seqno(engine);
>> -    LIST_HEAD(retire);
>> -    spin_lock_irq(&engine->timeline->lock);
>>       list_for_each_entry_safe(request, next,
>> -                 &engine->timeline->requests, link) {
>> -        if (!i915_seqno_passed(seqno, request->global_seqno))
>> +                 &ring->request_list, ring_link) {
>> +        if (!i915_request_completed(request))
>>               break;
>> -        list_move_tail(&request->link, &retire);
>> -    }
>> -    spin_unlock_irq(&engine->timeline->lock);
>> -
>> -    list_for_each_entry_safe(request, next, &retire, link)
>>           i915_request_retire(request);
>> +    }
>>   }
>>   void i915_retire_requests(struct drm_i915_private *i915)
>>   {
>> -    struct intel_engine_cs *engine;
>> -    enum intel_engine_id id;
>> +    struct intel_ring *ring, *next;
>>       lockdep_assert_held(&i915->drm.struct_mutex);
>>       if (!i915->gt.active_requests)
>>           return;
>> -    for_each_engine(engine, i915, id)
>> -        engine_retire_requests(engine);
>> +    list_for_each_entry_safe(ring, next, &i915->gt.rings, link)
>> +        ring_retire_requests(ring);
>>   }
>>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>> diff --git a/drivers/gpu/drm/i915/i915_utils.h 
>> b/drivers/gpu/drm/i915/i915_utils.h
>> index 0695717522ea..00165ad55fb3 100644
>> --- a/drivers/gpu/drm/i915/i915_utils.h
>> +++ b/drivers/gpu/drm/i915/i915_utils.h
>> @@ -120,6 +120,12 @@ static inline u64 ptr_to_u64(const void *ptr)
>>   #include <linux/list.h>
>> +static inline int list_is_first(const struct list_head *list,
>> +                const struct list_head *head)
>> +{
>> +    return head->next == list;
>> +}
>> +
>>   static inline void __list_del_many(struct list_head *head,
>>                      struct list_head *first)
>>   {
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
>> b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index c68ac605b8a9..75dca28782ee 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -1065,7 +1065,6 @@ int intel_ring_pin(struct intel_ring *ring,
>>   void intel_ring_reset(struct intel_ring *ring, u32 tail)
>>   {
>> -    GEM_BUG_ON(!list_empty(&ring->request_list));
>>       ring->tail = tail;
>>       ring->head = tail;
>>       ring->emit = tail;
>> @@ -1124,6 +1123,7 @@ intel_engine_create_ring(struct intel_engine_cs 
>> *engine, int size)
>>       GEM_BUG_ON(!is_power_of_2(size));
>>       GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
>> +    lockdep_assert_held(&engine->i915->drm.struct_mutex);
>>       ring = kzalloc(sizeof(*ring), GFP_KERNEL);
>>       if (!ring)
>> @@ -1149,6 +1149,8 @@ intel_engine_create_ring(struct intel_engine_cs 
>> *engine, int size)
>>       }
>>       ring->vma = vma;
>> +    list_add(&ring->link, &engine->i915->gt.rings);
>> +
>>       return ring;
>>   }
>> @@ -1160,6 +1162,8 @@ intel_ring_free(struct intel_ring *ring)
>>       i915_vma_close(ring->vma);
>>       __i915_gem_object_release_unless_active(obj);
>> +    list_del(&ring->link);
>> +
>>       kfree(ring);
>>   }
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
>> b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index c5e27905b0e1..d816f8dea245 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -129,6 +129,7 @@ struct intel_ring {
>>       void *vaddr;
>>       struct list_head request_list;
>> +    struct list_head link;
>>       u32 head;
>>       u32 tail;
>> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c 
>> b/drivers/gpu/drm/i915/selftests/mock_engine.c
>> index 78a89efa1119..a0bc324edadd 100644
>> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
>> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
>> @@ -140,9 +140,18 @@ static struct intel_ring *mock_ring(struct 
>> intel_engine_cs *engine)
>>       INIT_LIST_HEAD(&ring->request_list);
>>       intel_ring_update_space(ring);
>> +    list_add(&ring->link, &engine->i915->gt.rings);
>> +
>>       return ring;
>>   }
>> +static void mock_ring_free(struct intel_ring *ring)
>> +{
>> +    list_del(&ring->link);
>> +
>> +    kfree(ring);
>> +}
>> +
>>   struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>>                       const char *name,
>>                       int id)
>> @@ -155,12 +164,6 @@ struct intel_engine_cs *mock_engine(struct 
>> drm_i915_private *i915,
>>       if (!engine)
>>           return NULL;
>> -    engine->base.buffer = mock_ring(&engine->base);
>> -    if (!engine->base.buffer) {
>> -        kfree(engine);
>> -        return NULL;
>> -    }
>> -
>>       /* minimal engine setup for requests */
>>       engine->base.i915 = i915;
>>       snprintf(engine->base.name, sizeof(engine->base.name), "%s", name);
>> @@ -185,7 +188,16 @@ struct intel_engine_cs *mock_engine(struct 
>> drm_i915_private *i915,
>>       timer_setup(&engine->hw_delay, hw_delay_complete, 0);
>>       INIT_LIST_HEAD(&engine->hw_queue);
>> +    engine->base.buffer = mock_ring(&engine->base);
>> +    if (!engine->base.buffer)
>> +        goto err_breadcrumbs;
>> +
>>       return &engine->base;
>> +
>> +err_breadcrumbs:
>> +    intel_engine_fini_breadcrumbs(&engine->base);
>> +    kfree(engine);
>> +    return NULL;
>>   }
>>   void mock_engine_flush(struct intel_engine_cs *engine)
>> @@ -219,8 +231,9 @@ void mock_engine_free(struct intel_engine_cs *engine)
>>       if (engine->last_retired_context)
>>           engine->context_unpin(engine, engine->last_retired_context);
>> +    mock_ring_free(engine->buffer);
>> +
>>       intel_engine_fini_breadcrumbs(engine);
>> -    kfree(engine->buffer);
>>       kfree(engine);
>>   }
>> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c 
>> b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
>> index e6d4b882599a..ac4bacf8b5b9 100644
>> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
>> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
>> @@ -44,6 +44,7 @@ void mock_device_flush(struct drm_i915_private *i915)
>>           mock_engine_flush(engine);
>>       i915_retire_requests(i915);
>> +    GEM_BUG_ON(i915->gt.active_requests);
>>   }
>>   static void mock_device_release(struct drm_device *dev)
>> @@ -224,6 +225,7 @@ struct drm_i915_private *mock_gem_device(void)
>>           goto err_dependencies;
>>       mutex_lock(&i915->drm.struct_mutex);
>> +    INIT_LIST_HEAD(&i915->gt.rings);
>>       INIT_LIST_HEAD(&i915->gt.timelines);
>>       err = i915_gem_timeline_init__global(i915);
>>       if (err) {
>>
> 
> Sorry I did not spot the problem yet. Still in the phase of trying to 
> understand the approach.
> 
> Regards,
> 
> Tvrtko
Chris Wilson April 24, 2018, 2:57 p.m. UTC | #4
Quoting Tvrtko Ursulin (2018-04-24 15:46:49)
> 
> On 24/04/2018 14:14, Chris Wilson wrote:
> > In the next patch, rings are the central timeline as requests may jump
> > between engines. Therefore in the future as we retire in order along the
> > engine timeline, we may retire out-of-order within a ring (as the ring now
> > occurs along multiple engines), leading to much hilarity in miscomputing
> > the position of ring->head.
> > 
> > As an added bonus, retiring along the ring reduces the penalty of having
> > one execlists client do cleanup for another (old legacy submission
> > shares a ring between all clients). The downside is that slow and
> > irregular (off the critical path) process of cleaning up stale requests
> > after userspace becomes a modicum less efficient.
> > 
> > In the long run, it will become apparent that the ordered
> > ring->request_list matches the ring->timeline, a fun challenge for the
> > future will be unifying the two lists to avoid duplication!
> > 
> > v2: We need both engine-order and ring-order processing to maintain our
> > knowledge of where individual rings have completed upto as well as
> > knowing what was last executing on any engine. And finally by decoupling
> > retiring the contexts on the engine and the timelines along the rings,
> > we do have to keep a reference to the context on each request
> > (previously it was guaranteed by the context being pinned).
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> > @@ -322,9 +325,9 @@ static void advance_ring(struct i915_request *request)
> >       } else {
> >               tail = request->postfix;
> >       }
> > -     list_del(&request->ring_link);
> > +     list_del_init(&request->ring_link);
> 
> Why the _init flavour? There are two list heads for being on two 
> separate lists, but are either path reachable after being removed from 
> the respective lists? (I may find the answer as I read on.)

Yes. rq are held elsewhere and may end up being individually retired
(via the retire_upto paths) multiple times. i915_request_retire() should
only be called once (otherwise asserts).

So init + list_empty() is being used to prevent retiring the same
request multiple times.

> > -     if (engine->last_retired_context)
> > -             engine->context_unpin(engine, engine->last_retired_context);
> > -     engine->last_retired_context = request->ctx;
> > -
> > -     spin_lock_irq(&request->lock);
> > -     if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
> > -             dma_fence_signal_locked(&request->fence);
> > -     if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
> > -             intel_engine_cancel_signaling(request);
> > -     if (request->waitboost) {
> > -             GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
> > -             atomic_dec(&request->i915->gt_pm.rps.num_waiters);
> > -     }
> > -     spin_unlock_irq(&request->lock);
> > +     __retire_engine_upto(request->engine, request);
> >   
> 
> I did not figure out why do you need to do this. Normally retirement 
> driven from ring timelines would retire requests on the same engine 
> belonging to a different ring, as it walks over ring timelines.

Right, so retiring along a ring ends up out of order for a particular
engine.
 
> Only for direct callers of i915_request_retire it may make a difference 
> but I don't understand is it an functional difference or just optimisation.

I was hoping the GEM_BUG_ON(!list_is_first()); explained the rationale.

> This is then from where list_del_init comes from, since this function 
> can retire wider than the caller would expect. But then it retires on 
> the engine (upto) and the callers just walks the list and calls retire 
> only to find already unlinked elements. Could it just as well then 
> retire it completely?

We're still trying to only do as little work as we can get away with.

> > -     /* Move the oldest request to the slab-cache (if not in use!) */
> > -     rq = list_first_entry_or_null(&engine->timeline->requests,
> > -                                   typeof(*rq), link);
> > +     /* Move our oldest request to the slab-cache (if not in use!) */
> > +     rq = list_first_entry_or_null(&ring->request_list,
> > +                                   typeof(*rq), ring_link);
> 
> This is less efficient now - I wonder if you should still be looking at 
> the engine timeline here?

No, either way you have to walk all engine->requests upto this point, or
all ring->requests. To keep the interface manageable, retirement is in
ring order.

On the other hand, we don't retire as often if we can get away with it.
1 step forwards, 1 step backwards.
-Chris
Tvrtko Ursulin April 25, 2018, 9:30 a.m. UTC | #5
On 24/04/2018 15:57, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-04-24 15:46:49)
>>
>> On 24/04/2018 14:14, Chris Wilson wrote:
>>> In the next patch, rings are the central timeline as requests may jump
>>> between engines. Therefore in the future as we retire in order along the
>>> engine timeline, we may retire out-of-order within a ring (as the ring now
>>> occurs along multiple engines), leading to much hilarity in miscomputing
>>> the position of ring->head.
>>>
>>> As an added bonus, retiring along the ring reduces the penalty of having
>>> one execlists client do cleanup for another (old legacy submission
>>> shares a ring between all clients). The downside is that slow and
>>> irregular (off the critical path) process of cleaning up stale requests
>>> after userspace becomes a modicum less efficient.
>>>
>>> In the long run, it will become apparent that the ordered
>>> ring->request_list matches the ring->timeline, a fun challenge for the
>>> future will be unifying the two lists to avoid duplication!
>>>
>>> v2: We need both engine-order and ring-order processing to maintain our
>>> knowledge of where individual rings have completed upto as well as
>>> knowing what was last executing on any engine. And finally by decoupling
>>> retiring the contexts on the engine and the timelines along the rings,
>>> we do have to keep a reference to the context on each request
>>> (previously it was guaranteed by the context being pinned).
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>> @@ -322,9 +325,9 @@ static void advance_ring(struct i915_request *request)
>>>        } else {
>>>                tail = request->postfix;
>>>        }
>>> -     list_del(&request->ring_link);
>>> +     list_del_init(&request->ring_link);
>>
>> Why the _init flavour? There are two list heads for being on two
>> separate lists, but are either path reachable after being removed from
>> the respective lists? (I may find the answer as I read on.)
> 
> Yes. rq are held elsewhere and may end up being individually retired
> (via the retire_upto paths) multiple times. i915_request_retire() should
> only be called once (otherwise asserts).
> 
> So init + list_empty() is being used to prevent retiring the same
> request multiple times.
> 
>>> -     if (engine->last_retired_context)
>>> -             engine->context_unpin(engine, engine->last_retired_context);
>>> -     engine->last_retired_context = request->ctx;
>>> -
>>> -     spin_lock_irq(&request->lock);
>>> -     if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
>>> -             dma_fence_signal_locked(&request->fence);
>>> -     if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
>>> -             intel_engine_cancel_signaling(request);
>>> -     if (request->waitboost) {
>>> -             GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
>>> -             atomic_dec(&request->i915->gt_pm.rps.num_waiters);
>>> -     }
>>> -     spin_unlock_irq(&request->lock);
>>> +     __retire_engine_upto(request->engine, request);
>>>    
>>
>> I did not figure out why do you need to do this. Normally retirement
>> driven from ring timelines would retire requests on the same engine
>> belonging to a different ring, as it walks over ring timelines.
> 
> Right, so retiring along a ring ends up out of order for a particular
> engine.
>   
>> Only for direct callers of i915_request_retire it may make a difference
>> but I don't understand is it an functional difference or just optimisation.
> 
> I was hoping the GEM_BUG_ON(!list_is_first()); explained the rationale.

Thanks, I remembered after pressing send.

>> This is then from where list_del_init comes from, since this function
>> can retire wider than the caller would expect. But then it retires on
>> the engine (upto) and the callers just walks the list and calls retire
>> only to find already unlinked elements. Could it just as well then
>> retire it completely?
> 
> We're still trying to only do as little work as we can get away with.

I was thinking about something else, but it was a flawed idea since two 
walks are crucial for correct ordering for both engine and ring retirement.

>>> -     /* Move the oldest request to the slab-cache (if not in use!) */
>>> -     rq = list_first_entry_or_null(&engine->timeline->requests,
>>> -                                   typeof(*rq), link);
>>> +     /* Move our oldest request to the slab-cache (if not in use!) */
>>> +     rq = list_first_entry_or_null(&ring->request_list,
>>> +                                   typeof(*rq), ring_link);
>>
>> This is less efficient now - I wonder if you should still be looking at
>> the engine timeline here?
> 
> No, either way you have to walk all engine->requests upto this point, or
> all ring->requests. To keep the interface manageable, retirement is in
> ring order.
> 
> On the other hand, we don't retire as often if we can get away with it.
> 1 step forwards, 1 step backwards.

AFAIR and AFAICS the point of this was to free up the oldest request 
just to help with request recycling as new ones are added. By now 
changing it to be oldest on the ring timeline it a) my not be very old, 
or b) not even completed, while there might be much older and completed 
requests on the respective engine timeline.

Hm, or are you saying that it could cause out of order for the ring? I 
guess it could yeah.. it would have to use the upto variant.

Related, there could be potential to unify i915_request_retire_upto and 
ring_retire_requests. Latter could pass in NULL as the upto request, 
just the completed check would need to be different depending on the mode.

Regards,

Tvrtko
Chris Wilson April 25, 2018, 9:36 a.m. UTC | #6
Quoting Tvrtko Ursulin (2018-04-25 10:30:51)
> Related, there could be potential to unify i915_request_retire_upto and 
> ring_retire_requests. Latter could pass in NULL as the upto request, 
> just the completed check would need to be different depending on the mode.

Also remember that _upto is the public variant, doing the individual
request retirement was kept private. As the ordering is paramount, the
public version needs to be retire iff first, or retire all/upto.

An attempt was made to have one interface that dtrt.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8fd9fb6efba5..1837c01d44d0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2058,8 +2058,9 @@  struct drm_i915_private {
 		void (*resume)(struct drm_i915_private *);
 		void (*cleanup_engine)(struct intel_engine_cs *engine);
 
-		struct list_head timelines;
 		struct i915_gem_timeline global_timeline;
+		struct list_head timelines;
+		struct list_head rings;
 		u32 active_requests;
 		u32 request_serial;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 795ca83aed7a..906e2395c245 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5600,6 +5600,7 @@  int i915_gem_init_early(struct drm_i915_private *dev_priv)
 		goto err_dependencies;
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
+	INIT_LIST_HEAD(&dev_priv->gt.rings);
 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
 	err = i915_gem_timeline_init__global(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index b1993d4a1a53..aa84b5447fd5 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -286,6 +286,7 @@  static int reserve_gt(struct drm_i915_private *i915)
 
 static void unreserve_gt(struct drm_i915_private *i915)
 {
+	GEM_BUG_ON(!i915->gt.active_requests);
 	if (!--i915->gt.active_requests)
 		i915_gem_park(i915);
 }
@@ -298,6 +299,7 @@  void i915_gem_retire_noop(struct i915_gem_active *active,
 
 static void advance_ring(struct i915_request *request)
 {
+	struct intel_ring *ring = request->ring;
 	unsigned int tail;
 
 	/*
@@ -309,7 +311,8 @@  static void advance_ring(struct i915_request *request)
 	 * Note this requires that we are always called in request
 	 * completion order.
 	 */
-	if (list_is_last(&request->ring_link, &request->ring->request_list)) {
+	GEM_BUG_ON(!list_is_first(&request->ring_link, &ring->request_list));
+	if (list_is_last(&request->ring_link, &ring->request_list)) {
 		/*
 		 * We may race here with execlists resubmitting this request
 		 * as we retire it. The resubmission will move the ring->tail
@@ -322,9 +325,9 @@  static void advance_ring(struct i915_request *request)
 	} else {
 		tail = request->postfix;
 	}
-	list_del(&request->ring_link);
+	list_del_init(&request->ring_link);
 
-	request->ring->head = tail;
+	ring->head = tail;
 }
 
 static void free_capture_list(struct i915_request *request)
@@ -340,30 +343,79 @@  static void free_capture_list(struct i915_request *request)
 	}
 }
 
+static void __retire_engine_request(struct intel_engine_cs *engine,
+				    struct i915_request *request)
+{
+	GEM_BUG_ON(!i915_request_completed(request));
+
+	local_irq_disable();
+
+	spin_lock(&engine->timeline->lock);
+	GEM_BUG_ON(!list_is_first(&request->link, &engine->timeline->requests));
+	list_del_init(&request->link);
+	spin_unlock(&engine->timeline->lock);
+
+	spin_lock(&request->lock);
+	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
+		dma_fence_signal_locked(&request->fence);
+	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
+		intel_engine_cancel_signaling(request);
+	if (request->waitboost) {
+		GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
+		atomic_dec(&request->i915->gt_pm.rps.num_waiters);
+	}
+	spin_unlock(&request->lock);
+
+	local_irq_enable();
+
+	/*
+	 * The backing object for the context is done after switching to the
+	 * *next* context. Therefore we cannot retire the previous context until
+	 * the next context has already started running. However, since we
+	 * cannot take the required locks at i915_request_submit() we
+	 * defer the unpinning of the active context to now, retirement of
+	 * the subsequent request.
+	 */
+	if (engine->last_retired_context)
+		engine->context_unpin(engine, engine->last_retired_context);
+	engine->last_retired_context = request->ctx;
+}
+
+static void __retire_engine_upto(struct intel_engine_cs *engine,
+				 struct i915_request *rq)
+{
+	struct i915_request *tmp;
+
+	GEM_BUG_ON(!i915_request_completed(rq));
+	if (list_empty(&rq->link))
+		return;
+
+	do {
+		tmp = list_first_entry(&engine->timeline->requests,
+				       typeof(*tmp), link);
+
+		GEM_BUG_ON(tmp->engine != engine);
+		__retire_engine_request(engine, tmp);
+	} while (tmp != rq);
+}
+
 static void i915_request_retire(struct i915_request *request)
 {
-	struct intel_engine_cs *engine = request->engine;
 	struct i915_gem_active *active, *next;
 
 	GEM_TRACE("%s fence %llx:%d, global=%d, current %d\n",
-		  engine->name,
+		  request->engine->name,
 		  request->fence.context, request->fence.seqno,
 		  request->global_seqno,
-		  intel_engine_get_seqno(engine));
+		  intel_engine_get_seqno(request->engine));
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
 	GEM_BUG_ON(!i915_request_completed(request));
-	GEM_BUG_ON(!request->i915->gt.active_requests);
 
 	trace_i915_request_retire(request);
 
-	spin_lock_irq(&engine->timeline->lock);
-	list_del_init(&request->link);
-	spin_unlock_irq(&engine->timeline->lock);
-
 	advance_ring(request);
-
 	free_capture_list(request);
 
 	/*
@@ -400,49 +452,30 @@  static void i915_request_retire(struct i915_request *request)
 	/* Retirement decays the ban score as it is a sign of ctx progress */
 	atomic_dec_if_positive(&request->ctx->ban_score);
 
-	/*
-	 * The backing object for the context is done after switching to the
-	 * *next* context. Therefore we cannot retire the previous context until
-	 * the next context has already started running. However, since we
-	 * cannot take the required locks at i915_request_submit() we
-	 * defer the unpinning of the active context to now, retirement of
-	 * the subsequent request.
-	 */
-	if (engine->last_retired_context)
-		engine->context_unpin(engine, engine->last_retired_context);
-	engine->last_retired_context = request->ctx;
-
-	spin_lock_irq(&request->lock);
-	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags))
-		dma_fence_signal_locked(&request->fence);
-	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
-		intel_engine_cancel_signaling(request);
-	if (request->waitboost) {
-		GEM_BUG_ON(!atomic_read(&request->i915->gt_pm.rps.num_waiters));
-		atomic_dec(&request->i915->gt_pm.rps.num_waiters);
-	}
-	spin_unlock_irq(&request->lock);
+	__retire_engine_upto(request->engine, request);
 
 	unreserve_gt(request->i915);
 
 	i915_sched_node_fini(request->i915, &request->sched);
+
+	i915_gem_context_put(request->ctx);
 	i915_request_put(request);
 }
 
 void i915_request_retire_upto(struct i915_request *rq)
 {
-	struct intel_engine_cs *engine = rq->engine;
+	struct intel_ring *ring = rq->ring;
 	struct i915_request *tmp;
 
 	lockdep_assert_held(&rq->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_request_completed(rq));
 
-	if (list_empty(&rq->link))
+	if (list_empty(&rq->ring_link))
 		return;
 
 	do {
-		tmp = list_first_entry(&engine->timeline->requests,
-				       typeof(*tmp), link);
+		tmp = list_first_entry(&ring->request_list,
+				       typeof(*tmp), ring_link);
 
 		i915_request_retire(tmp);
 	} while (tmp != rq);
@@ -651,9 +684,9 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	if (ret)
 		goto err_unreserve;
 
-	/* Move the oldest request to the slab-cache (if not in use!) */
-	rq = list_first_entry_or_null(&engine->timeline->requests,
-				      typeof(*rq), link);
+	/* Move our oldest request to the slab-cache (if not in use!) */
+	rq = list_first_entry_or_null(&ring->request_list,
+				      typeof(*rq), ring_link);
 	if (rq && i915_request_completed(rq))
 		i915_request_retire(rq);
 
@@ -733,7 +766,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	INIT_LIST_HEAD(&rq->active_list);
 	rq->i915 = i915;
 	rq->engine = engine;
-	rq->ctx = ctx;
+	rq->ctx = i915_gem_context_get(ctx);
 	rq->ring = ring;
 
 	/* No zalloc, must clear what we need by hand */
@@ -777,6 +810,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 
 err_unwind:
 	rq->ring->emit = rq->head;
+	i915_gem_context_put(ctx);
 
 	/* Make sure we didn't add ourselves to external state before freeing */
 	GEM_BUG_ON(!list_empty(&rq->active_list));
@@ -1357,38 +1391,30 @@  long i915_request_wait(struct i915_request *rq,
 	return timeout;
 }
 
-static void engine_retire_requests(struct intel_engine_cs *engine)
+static void ring_retire_requests(struct intel_ring *ring)
 {
 	struct i915_request *request, *next;
-	u32 seqno = intel_engine_get_seqno(engine);
-	LIST_HEAD(retire);
 
-	spin_lock_irq(&engine->timeline->lock);
 	list_for_each_entry_safe(request, next,
-				 &engine->timeline->requests, link) {
-		if (!i915_seqno_passed(seqno, request->global_seqno))
+				 &ring->request_list, ring_link) {
+		if (!i915_request_completed(request))
 			break;
 
-		list_move_tail(&request->link, &retire);
-	}
-	spin_unlock_irq(&engine->timeline->lock);
-
-	list_for_each_entry_safe(request, next, &retire, link)
 		i915_request_retire(request);
+	}
 }
 
 void i915_retire_requests(struct drm_i915_private *i915)
 {
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
+	struct intel_ring *ring, *next;
 
 	lockdep_assert_held(&i915->drm.struct_mutex);
 
 	if (!i915->gt.active_requests)
 		return;
 
-	for_each_engine(engine, i915, id)
-		engine_retire_requests(engine);
+	list_for_each_entry_safe(ring, next, &i915->gt.rings, link)
+		ring_retire_requests(ring);
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 0695717522ea..00165ad55fb3 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -120,6 +120,12 @@  static inline u64 ptr_to_u64(const void *ptr)
 
 #include <linux/list.h>
 
+static inline int list_is_first(const struct list_head *list,
+				const struct list_head *head)
+{
+	return head->next == list;
+}
+
 static inline void __list_del_many(struct list_head *head,
 				   struct list_head *first)
 {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index c68ac605b8a9..75dca28782ee 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1065,7 +1065,6 @@  int intel_ring_pin(struct intel_ring *ring,
 
 void intel_ring_reset(struct intel_ring *ring, u32 tail)
 {
-	GEM_BUG_ON(!list_empty(&ring->request_list));
 	ring->tail = tail;
 	ring->head = tail;
 	ring->emit = tail;
@@ -1124,6 +1123,7 @@  intel_engine_create_ring(struct intel_engine_cs *engine, int size)
 
 	GEM_BUG_ON(!is_power_of_2(size));
 	GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
+	lockdep_assert_held(&engine->i915->drm.struct_mutex);
 
 	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
 	if (!ring)
@@ -1149,6 +1149,8 @@  intel_engine_create_ring(struct intel_engine_cs *engine, int size)
 	}
 	ring->vma = vma;
 
+	list_add(&ring->link, &engine->i915->gt.rings);
+
 	return ring;
 }
 
@@ -1160,6 +1162,8 @@  intel_ring_free(struct intel_ring *ring)
 	i915_vma_close(ring->vma);
 	__i915_gem_object_release_unless_active(obj);
 
+	list_del(&ring->link);
+
 	kfree(ring);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c5e27905b0e1..d816f8dea245 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -129,6 +129,7 @@  struct intel_ring {
 	void *vaddr;
 
 	struct list_head request_list;
+	struct list_head link;
 
 	u32 head;
 	u32 tail;
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 78a89efa1119..a0bc324edadd 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -140,9 +140,18 @@  static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
 	INIT_LIST_HEAD(&ring->request_list);
 	intel_ring_update_space(ring);
 
+	list_add(&ring->link, &engine->i915->gt.rings);
+
 	return ring;
 }
 
+static void mock_ring_free(struct intel_ring *ring)
+{
+	list_del(&ring->link);
+
+	kfree(ring);
+}
+
 struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 				    const char *name,
 				    int id)
@@ -155,12 +164,6 @@  struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	if (!engine)
 		return NULL;
 
-	engine->base.buffer = mock_ring(&engine->base);
-	if (!engine->base.buffer) {
-		kfree(engine);
-		return NULL;
-	}
-
 	/* minimal engine setup for requests */
 	engine->base.i915 = i915;
 	snprintf(engine->base.name, sizeof(engine->base.name), "%s", name);
@@ -185,7 +188,16 @@  struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	timer_setup(&engine->hw_delay, hw_delay_complete, 0);
 	INIT_LIST_HEAD(&engine->hw_queue);
 
+	engine->base.buffer = mock_ring(&engine->base);
+	if (!engine->base.buffer)
+		goto err_breadcrumbs;
+
 	return &engine->base;
+
+err_breadcrumbs:
+	intel_engine_fini_breadcrumbs(&engine->base);
+	kfree(engine);
+	return NULL;
 }
 
 void mock_engine_flush(struct intel_engine_cs *engine)
@@ -219,8 +231,9 @@  void mock_engine_free(struct intel_engine_cs *engine)
 	if (engine->last_retired_context)
 		engine->context_unpin(engine, engine->last_retired_context);
 
+	mock_ring_free(engine->buffer);
+
 	intel_engine_fini_breadcrumbs(engine);
 
-	kfree(engine->buffer);
 	kfree(engine);
 }
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index e6d4b882599a..ac4bacf8b5b9 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -44,6 +44,7 @@  void mock_device_flush(struct drm_i915_private *i915)
 		mock_engine_flush(engine);
 
 	i915_retire_requests(i915);
+	GEM_BUG_ON(i915->gt.active_requests);
 }
 
 static void mock_device_release(struct drm_device *dev)
@@ -224,6 +225,7 @@  struct drm_i915_private *mock_gem_device(void)
 		goto err_dependencies;
 
 	mutex_lock(&i915->drm.struct_mutex);
+	INIT_LIST_HEAD(&i915->gt.rings);
 	INIT_LIST_HEAD(&i915->gt.timelines);
 	err = i915_gem_timeline_init__global(i915);
 	if (err) {