[08/10] drm/i915/gt: Track signaled breadcrumbs outside of the breadcrumb spinlock
diff mbox series

Message ID 20200720092312.16975-8-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [01/10] drm/i915/gem: Remove disordered per-file request list for throttling
Related show

Commit Message

Chris Wilson July 20, 2020, 9:23 a.m. UTC
Make b->signaled_requests a lockless-list so that we can manipulate it
outside of the b->irq_lock.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_breadcrumbs.c   | 42 +++++++++----------
 .../gpu/drm/i915/gt/intel_breadcrumbs_types.h |  2 +-
 drivers/gpu/drm/i915/i915_request.h           |  6 ++-
 3 files changed, 26 insertions(+), 24 deletions(-)

Comments

Tvrtko Ursulin July 22, 2020, 1:26 p.m. UTC | #1
On 20/07/2020 10:23, Chris Wilson wrote:
> Make b->signaled_requests a lockless-list so that we can manipulate it
> outside of the b->irq_lock.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gt/intel_breadcrumbs.c   | 42 +++++++++----------
>   .../gpu/drm/i915/gt/intel_breadcrumbs_types.h |  2 +-
>   drivers/gpu/drm/i915/i915_request.h           |  6 ++-
>   3 files changed, 26 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> index 59e8cd505569..2b3ad17c63b9 100644
> --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> @@ -175,32 +175,23 @@ static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl)
>   		intel_engine_add_retire(b->irq_engine, tl);
>   }
>   
> -static bool __signal_request(struct i915_request *rq, struct list_head *signals)
> -{
> -	clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
> -
> -	if (!__dma_fence_signal(&rq->fence))
> -		return false;
> -
> -	list_add_tail(&rq->signal_link, signals);
> -	return true;
> -}
> -
>   static void signal_irq_work(struct irq_work *work)
>   {
>   	struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work);
>   	const ktime_t timestamp = ktime_get();
> +	struct llist_node *signal, *sn;
>   	struct intel_context *ce, *cn;
>   	struct list_head *pos, *next;
> -	LIST_HEAD(signal);
> +
> +	signal = NULL;
> +	if (!llist_empty(&b->signaled_requests))
> +		signal = llist_del_all(&b->signaled_requests);

Uncoditional llist_del_all? It's not likely list will be empty and if it 
is llist_del_all will return NULL.

>   
>   	spin_lock(&b->irq_lock);
>   
> -	if (b->irq_armed && list_empty(&b->signalers))
> +	if (!signal && b->irq_armed && list_empty(&b->signalers))

Why !signal check in here? Couldn't figure out what changed to make this 
needed.

>   		__intel_breadcrumbs_disarm_irq(b);
>   
> -	list_splice_init(&b->signaled_requests, &signal);
> -
>   	list_for_each_entry_safe(ce, cn, &b->signalers, signal_link) {
>   		GEM_BUG_ON(list_empty(&ce->signals));
>   
> @@ -217,8 +208,13 @@ static void signal_irq_work(struct irq_work *work)
>   			 * spinlock as the callback chain may end up adding
>   			 * more signalers to the same context or engine.
>   			 */
> -			if (!__signal_request(rq, &signal))
> +			clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
> +			if (__dma_fence_signal(&rq->fence)) {
> +				rq->signal_node.next = signal;
> +				signal = &rq->signal_node;

Okay it creates a bit of a differently ordered list like this but I 
think it doesn't matter.

> +			} else {
>   				i915_request_put(rq);
> +			}
>   		}
>   
>   		/*
> @@ -238,9 +234,9 @@ static void signal_irq_work(struct irq_work *work)
>   
>   	spin_unlock(&b->irq_lock);
>   
> -	list_for_each_safe(pos, next, &signal) {
> +	llist_for_each_safe(signal, sn, signal) {
>   		struct i915_request *rq =
> -			list_entry(pos, typeof(*rq), signal_link);
> +			llist_entry(signal, typeof(*rq), signal_node);
>   		struct list_head cb_list;
>   
>   		spin_lock(&rq->lock);
> @@ -264,7 +260,7 @@ intel_breadcrumbs_create(struct intel_engine_cs *irq_engine)
>   
>   	spin_lock_init(&b->irq_lock);
>   	INIT_LIST_HEAD(&b->signalers);
> -	INIT_LIST_HEAD(&b->signaled_requests);
> +	init_llist_head(&b->signaled_requests);
>   
>   	init_irq_work(&b->irq_work, signal_irq_work);
>   
> @@ -329,10 +325,12 @@ static void insert_breadcrumb(struct i915_request *rq,
>   	 * its signal completion.
>   	 */
>   	if (__request_completed(rq)) {
> -		if (__signal_request(rq, &b->signaled_requests))
> -			irq_work_queue(&b->irq_work);
> -		else
> +		if (__dma_fence_signal(&rq->fence)) {
> +			if (llist_add(&rq->signal_node, &b->signaled_requests))
> +				irq_work_queue(&b->irq_work);
> +		} else {
>   			i915_request_put(rq);
> +		}
>   		return;
>   	}
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h
> index 8e53b9942695..3fa19820b37a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h
> @@ -35,7 +35,7 @@ struct intel_breadcrumbs {
>   	struct intel_engine_cs *irq_engine;
>   
>   	struct list_head signalers;
> -	struct list_head signaled_requests;
> +	struct llist_head signaled_requests;
>   
>   	struct irq_work irq_work; /* for use from inside irq_lock */
>   
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 16b721080195..874af6db6103 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -176,7 +176,11 @@ struct i915_request {
>   	struct intel_context *context;
>   	struct intel_ring *ring;
>   	struct intel_timeline __rcu *timeline;
> -	struct list_head signal_link;
> +
> +	union {
> +		struct list_head signal_link;
> +		struct llist_node signal_node;

Transition is only from signal_link to signal_node, which uses and 
initializes only one field, so I think potential garbage in other ones 
is safe.

> +	};
>   
>   	/*
>   	 * The rcu epoch of when this request was allocated. Used to judiciously
> 

Regards,

Tvrtko
Chris Wilson July 22, 2020, 1:42 p.m. UTC | #2
Quoting Tvrtko Ursulin (2020-07-22 14:26:36)
> 
> On 20/07/2020 10:23, Chris Wilson wrote:
> > Make b->signaled_requests a lockless-list so that we can manipulate it
> > outside of the b->irq_lock.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/gt/intel_breadcrumbs.c   | 42 +++++++++----------
> >   .../gpu/drm/i915/gt/intel_breadcrumbs_types.h |  2 +-
> >   drivers/gpu/drm/i915/i915_request.h           |  6 ++-
> >   3 files changed, 26 insertions(+), 24 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> > index 59e8cd505569..2b3ad17c63b9 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
> > @@ -175,32 +175,23 @@ static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl)
> >               intel_engine_add_retire(b->irq_engine, tl);
> >   }
> >   
> > -static bool __signal_request(struct i915_request *rq, struct list_head *signals)
> > -{
> > -     clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
> > -
> > -     if (!__dma_fence_signal(&rq->fence))
> > -             return false;
> > -
> > -     list_add_tail(&rq->signal_link, signals);
> > -     return true;
> > -}
> > -
> >   static void signal_irq_work(struct irq_work *work)
> >   {
> >       struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work);
> >       const ktime_t timestamp = ktime_get();
> > +     struct llist_node *signal, *sn;
> >       struct intel_context *ce, *cn;
> >       struct list_head *pos, *next;
> > -     LIST_HEAD(signal);
> > +
> > +     signal = NULL;
> > +     if (!llist_empty(&b->signaled_requests))
> > +             signal = llist_del_all(&b->signaled_requests);
> 
> Uncoditional llist_del_all? It's not likely list will be empty and if it 
> is llist_del_all will return NULL.

Nah, this is likely to be empty, since this will only be filled after we
resubmit an already completed request. So the conditional is cheaper
than the locked xchg.

> >       spin_lock(&b->irq_lock);
> >   
> > -     if (b->irq_armed && list_empty(&b->signalers))
> > +     if (!signal && b->irq_armed && list_empty(&b->signalers))
> 
> Why !signal check in here? Couldn't figure out what changed to make this 
> needed.

Because we invoke signal_irq_work() after adding to
b->signaled_requests, I thought it was sensible to attempt keep the
interrupt shadow in place until after the following interrupt.
[Sadly we need to avoid the frequent enable/disable of the interrupts,
they are expensive enough to perform that it shows up in throughput
measurement. The question has become our _user_ interrupts now cheap
enough to always enable? The problem with snb dying under an interrupt
storm has been fixed, afaict we no longer burn through an entire core
handling interrupts when the GPU is busy.]

> >               __intel_breadcrumbs_disarm_irq(b);
> >   
> > -     list_splice_init(&b->signaled_requests, &signal);
> > -
> >       list_for_each_entry_safe(ce, cn, &b->signalers, signal_link) {
> >               GEM_BUG_ON(list_empty(&ce->signals));
> >   
> > @@ -217,8 +208,13 @@ static void signal_irq_work(struct irq_work *work)
> >                        * spinlock as the callback chain may end up adding
> >                        * more signalers to the same context or engine.
> >                        */
> > -                     if (!__signal_request(rq, &signal))
> > +                     clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
> > +                     if (__dma_fence_signal(&rq->fence)) {
> > +                             rq->signal_node.next = signal;
> > +                             signal = &rq->signal_node;
> 
> Okay it creates a bit of a differently ordered list like this but I 
> think it doesn't matter.

Right. We change the order in which we signal, but I reasoned that we
are already signaling in a loose order as dma_fence_signal() is called
from many, many different paths without a care to timeline order.

Signaling the most recent first may improve latency in some cases, and
since the latency will be smaller for the most recent request that will
be a relatively large improvement vs the increase in latency for
handling the older request after the most recent. This is pure
speculation, I haven't measured the effect.

> > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> > index 16b721080195..874af6db6103 100644
> > --- a/drivers/gpu/drm/i915/i915_request.h
> > +++ b/drivers/gpu/drm/i915/i915_request.h
> > @@ -176,7 +176,11 @@ struct i915_request {
> >       struct intel_context *context;
> >       struct intel_ring *ring;
> >       struct intel_timeline __rcu *timeline;
> > -     struct list_head signal_link;
> > +
> > +     union {
> > +             struct list_head signal_link;
> > +             struct llist_node signal_node;
> 
> Transition is only from signal_link to signal_node, which uses and 
> initializes only one field, so I think potential garbage in other ones 
> is safe.

Yup, the signal_link was used to being garbage after signaling :)
-Chris

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index 59e8cd505569..2b3ad17c63b9 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -175,32 +175,23 @@  static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl)
 		intel_engine_add_retire(b->irq_engine, tl);
 }
 
-static bool __signal_request(struct i915_request *rq, struct list_head *signals)
-{
-	clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
-
-	if (!__dma_fence_signal(&rq->fence))
-		return false;
-
-	list_add_tail(&rq->signal_link, signals);
-	return true;
-}
-
 static void signal_irq_work(struct irq_work *work)
 {
 	struct intel_breadcrumbs *b = container_of(work, typeof(*b), irq_work);
 	const ktime_t timestamp = ktime_get();
+	struct llist_node *signal, *sn;
 	struct intel_context *ce, *cn;
 	struct list_head *pos, *next;
-	LIST_HEAD(signal);
+
+	signal = NULL;
+	if (!llist_empty(&b->signaled_requests))
+		signal = llist_del_all(&b->signaled_requests);
 
 	spin_lock(&b->irq_lock);
 
-	if (b->irq_armed && list_empty(&b->signalers))
+	if (!signal && b->irq_armed && list_empty(&b->signalers))
 		__intel_breadcrumbs_disarm_irq(b);
 
-	list_splice_init(&b->signaled_requests, &signal);
-
 	list_for_each_entry_safe(ce, cn, &b->signalers, signal_link) {
 		GEM_BUG_ON(list_empty(&ce->signals));
 
@@ -217,8 +208,13 @@  static void signal_irq_work(struct irq_work *work)
 			 * spinlock as the callback chain may end up adding
 			 * more signalers to the same context or engine.
 			 */
-			if (!__signal_request(rq, &signal))
+			clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
+			if (__dma_fence_signal(&rq->fence)) {
+				rq->signal_node.next = signal;
+				signal = &rq->signal_node;
+			} else {
 				i915_request_put(rq);
+			}
 		}
 
 		/*
@@ -238,9 +234,9 @@  static void signal_irq_work(struct irq_work *work)
 
 	spin_unlock(&b->irq_lock);
 
-	list_for_each_safe(pos, next, &signal) {
+	llist_for_each_safe(signal, sn, signal) {
 		struct i915_request *rq =
-			list_entry(pos, typeof(*rq), signal_link);
+			llist_entry(signal, typeof(*rq), signal_node);
 		struct list_head cb_list;
 
 		spin_lock(&rq->lock);
@@ -264,7 +260,7 @@  intel_breadcrumbs_create(struct intel_engine_cs *irq_engine)
 
 	spin_lock_init(&b->irq_lock);
 	INIT_LIST_HEAD(&b->signalers);
-	INIT_LIST_HEAD(&b->signaled_requests);
+	init_llist_head(&b->signaled_requests);
 
 	init_irq_work(&b->irq_work, signal_irq_work);
 
@@ -329,10 +325,12 @@  static void insert_breadcrumb(struct i915_request *rq,
 	 * its signal completion.
 	 */
 	if (__request_completed(rq)) {
-		if (__signal_request(rq, &b->signaled_requests))
-			irq_work_queue(&b->irq_work);
-		else
+		if (__dma_fence_signal(&rq->fence)) {
+			if (llist_add(&rq->signal_node, &b->signaled_requests))
+				irq_work_queue(&b->irq_work);
+		} else {
 			i915_request_put(rq);
+		}
 		return;
 	}
 
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h
index 8e53b9942695..3fa19820b37a 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs_types.h
@@ -35,7 +35,7 @@  struct intel_breadcrumbs {
 	struct intel_engine_cs *irq_engine;
 
 	struct list_head signalers;
-	struct list_head signaled_requests;
+	struct llist_head signaled_requests;
 
 	struct irq_work irq_work; /* for use from inside irq_lock */
 
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 16b721080195..874af6db6103 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -176,7 +176,11 @@  struct i915_request {
 	struct intel_context *context;
 	struct intel_ring *ring;
 	struct intel_timeline __rcu *timeline;
-	struct list_head signal_link;
+
+	union {
+		struct list_head signal_link;
+		struct llist_node signal_node;
+	};
 
 	/*
 	 * The rcu epoch of when this request was allocated. Used to judiciously