
[7/7] dma-buf/sw-sync: Use an rbtree to sort fences in the timeline

Message ID 20170629125930.821-7-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson June 29, 2017, 12:59 p.m. UTC
Reduce the list iteration when incrementing the timeline by storing the
fences in increasing order.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: Gustavo Padovan <gustavo@padovan.org>
---
 drivers/dma-buf/sw_sync.c    | 49 ++++++++++++++++++++++++++++++++++++++------
 drivers/dma-buf/sync_debug.h |  5 +++++
 2 files changed, 48 insertions(+), 6 deletions(-)
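
As a rough illustration of the idea (a stand-alone user-space sketch with made-up names, not the kernel code): with the pending points kept in ascending seqno order, advancing the timeline only walks the already-signaled prefix and stops at the first point that is still in the future.

#include <stdio.h>

struct pt { unsigned int seqno; int signaled; };

/* Walk the points in ascending seqno order; once one is still pending,
 * everything after it must be pending too, so the loop breaks early.
 * In the patch this per-point step is list_del_init() + rb_erase(). */
static void timeline_signal(struct pt *pts, int n, unsigned int value)
{
	for (int i = 0; i < n; i++) {
		if (pts[i].seqno > value)
			break;
		pts[i].signaled = 1;
	}
}

int main(void)
{
	struct pt pts[] = { {1, 0}, {2, 0}, {5, 0}, {7, 0}, {9, 0} };
	int n = sizeof(pts) / sizeof(pts[0]);

	timeline_signal(pts, n, 5);
	for (int i = 0; i < n; i++)
		printf("seqno %u: %s\n", pts[i].seqno,
		       pts[i].signaled ? "signaled" : "pending");
	return 0;
}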

Comments

Sean Paul June 29, 2017, 6:10 p.m. UTC | #1
On Thu, Jun 29, 2017 at 01:59:30PM +0100, Chris Wilson wrote:
> Reduce the list iteration when incrementing the timeline by storing the
> fences in increasing order.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Sumit Semwal <sumit.semwal@linaro.org>
> Cc: Sean Paul <seanpaul@chromium.org>
> Cc: Gustavo Padovan <gustavo@padovan.org>
> ---
>  drivers/dma-buf/sw_sync.c    | 49 ++++++++++++++++++++++++++++++++++++++------
>  drivers/dma-buf/sync_debug.h |  5 +++++
>  2 files changed, 48 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c
> index e51fe11bbbea..8cef5d345316 100644
> --- a/drivers/dma-buf/sw_sync.c
> +++ b/drivers/dma-buf/sw_sync.c
> @@ -96,6 +96,7 @@ static struct sync_timeline *sync_timeline_create(const char *name)
>  	obj->context = dma_fence_context_alloc(1);
>  	strlcpy(obj->name, name, sizeof(obj->name));
>  
> +	obj->pt_tree = RB_ROOT;
>  	INIT_LIST_HEAD(&obj->pt_list);
>  	spin_lock_init(&obj->lock);
>  
> @@ -142,9 +143,13 @@ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc)
>  
>  	obj->value += inc;
>  
> -	list_for_each_entry_safe(pt, next, &obj->pt_list, link)
> -		if (dma_fence_is_signaled_locked(&pt->base))
> -			list_del_init(&pt->link);
> +	list_for_each_entry_safe(pt, next, &obj->pt_list, link) {
> +		if (!dma_fence_is_signaled_locked(&pt->base))
> +			break;
> +
> +		list_del_init(&pt->link);
> +		rb_erase(&pt->node, &obj->pt_tree);
> +	}
>  
>  	spin_unlock_irq(&obj->lock);
>  }
> @@ -174,8 +179,38 @@ static struct sync_pt *sync_pt_create(struct sync_timeline *obj,
>  	INIT_LIST_HEAD(&pt->link);
>  
>  	spin_lock_irq(&obj->lock);
> -	if (!dma_fence_is_signaled_locked(&pt->base))
> -		list_add_tail(&pt->link, &obj->pt_list);
> +	if (!dma_fence_is_signaled_locked(&pt->base)) {
> +		struct rb_node **p = &obj->pt_tree.rb_node;
> +		struct rb_node *parent = NULL;
> +
> +		while (*p) {
> +			struct sync_pt *other;
> +			int cmp;
> +
> +			parent = *p;
> +			other = rb_entry(parent, typeof(*pt), node);
> +			cmp = value - other->base.seqno;

We're imposing an implicit limitation on userspace here that points cannot
be more than INT_MAX apart (since they'll be in the wrong order in the tree).

I wonder how much this patch will actually save, given that the number of active points
on a given timeline should be small. Do we have any evidence that this
optimization is warranted?

Sean

> +			if (cmp > 0) {
> +				p = &parent->rb_right;
> +			} else if (cmp < 0) {
> +				p = &parent->rb_left;
> +			} else {
> +				if (dma_fence_get_rcu(&other->base)) {
> +					dma_fence_put(&pt->base);
> +					pt = other;
> +					goto unlock;
> +				}
> +				p = &parent->rb_left;
> +			}
> +		}
> +		rb_link_node(&pt->node, parent, p);
> +		rb_insert_color(&pt->node, &obj->pt_tree);
> +
> +		parent = rb_next(&pt->node);
> +		list_add_tail(&pt->link,
> +			      parent ? &rb_entry(parent, typeof(*pt), node)->link : &obj->pt_list);
> +	}
> +unlock:
>  	spin_unlock_irq(&obj->lock);
>  
>  	return pt;
> @@ -201,8 +236,10 @@ static void timeline_fence_release(struct dma_fence *fence)
>  
>  	spin_lock_irqsave(fence->lock, flags);
>  
> -	if (!list_empty(&pt->link))
> +	if (!list_empty(&pt->link)) {
>  		list_del(&pt->link);
> +		rb_erase(&pt->node, &parent->pt_tree);
> +	}
>  
>  	spin_unlock_irqrestore(fence->lock, flags);
>  
> diff --git a/drivers/dma-buf/sync_debug.h b/drivers/dma-buf/sync_debug.h
> index 899ba0e19fd3..b7a5fab12179 100644
> --- a/drivers/dma-buf/sync_debug.h
> +++ b/drivers/dma-buf/sync_debug.h
> @@ -14,6 +14,7 @@
>  #define _LINUX_SYNC_H
>  
>  #include <linux/list.h>
> +#include <linux/rbtree.h>
>  #include <linux/spinlock.h>
>  #include <linux/dma-fence.h>
>  
> @@ -25,6 +26,7 @@
>   * @kref:		reference count on fence.
>   * @name:		name of the sync_timeline. Useful for debugging
>   * @lock:		lock protecting @child_list_head and fence.status
> + * @pt_tree:		rbtree of active (unsignaled/errored) sync_pts
>   * @pt_list:		list of active (unsignaled/errored) sync_pts
>   * @sync_timeline_list:	membership in global sync_timeline_list
>   */
> @@ -36,6 +38,7 @@ struct sync_timeline {
>  	u64			context;
>  	int			value;
>  
> +	struct rb_root		pt_tree;
>  	struct list_head	pt_list;
>  	spinlock_t		lock;
>  
> @@ -51,10 +54,12 @@ static inline struct sync_timeline *dma_fence_parent(struct dma_fence *fence)
>   * struct sync_pt - sync_pt object
>   * @base: base fence object
>   * @link: link on the sync timeline's list
> + * @node: node in the sync timeline's tree
>   */
>  struct sync_pt {
>  	struct dma_fence base;
>  	struct list_head link;
> +	struct rb_node node;
>  };
>  
>  #ifdef CONFIG_SW_SYNC
> -- 
> 2.13.1

Chris Wilson June 29, 2017, 6:29 p.m. UTC | #2
Quoting Sean Paul (2017-06-29 19:10:11)
> On Thu, Jun 29, 2017 at 01:59:30PM +0100, Chris Wilson wrote:
> > Reduce the list iteration when incrementing the timeline by storing the
> > fences in increasing order.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Sumit Semwal <sumit.semwal@linaro.org>
> > Cc: Sean Paul <seanpaul@chromium.org>
> > Cc: Gustavo Padovan <gustavo@padovan.org>
> > ---
> >  drivers/dma-buf/sw_sync.c    | 49 ++++++++++++++++++++++++++++++++++++++------
> >  drivers/dma-buf/sync_debug.h |  5 +++++
> >  2 files changed, 48 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c
> > index e51fe11bbbea..8cef5d345316 100644
> > --- a/drivers/dma-buf/sw_sync.c
> > +++ b/drivers/dma-buf/sw_sync.c
> > @@ -96,6 +96,7 @@ static struct sync_timeline *sync_timeline_create(const char *name)
> >       obj->context = dma_fence_context_alloc(1);
> >       strlcpy(obj->name, name, sizeof(obj->name));
> >  
> > +     obj->pt_tree = RB_ROOT;
> >       INIT_LIST_HEAD(&obj->pt_list);
> >       spin_lock_init(&obj->lock);
> >  
> > @@ -142,9 +143,13 @@ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc)
> >  
> >       obj->value += inc;
> >  
> > -     list_for_each_entry_safe(pt, next, &obj->pt_list, link)
> > -             if (dma_fence_is_signaled_locked(&pt->base))
> > -                     list_del_init(&pt->link);
> > +     list_for_each_entry_safe(pt, next, &obj->pt_list, link) {
> > +             if (!dma_fence_is_signaled_locked(&pt->base))
> > +                     break;
> > +
> > +             list_del_init(&pt->link);
> > +             rb_erase(&pt->node, &obj->pt_tree);
> > +     }
> >  
> >       spin_unlock_irq(&obj->lock);
> >  }
> > @@ -174,8 +179,38 @@ static struct sync_pt *sync_pt_create(struct sync_timeline *obj,
> >       INIT_LIST_HEAD(&pt->link);
> >  
> >       spin_lock_irq(&obj->lock);
> > -     if (!dma_fence_is_signaled_locked(&pt->base))
> > -             list_add_tail(&pt->link, &obj->pt_list);
> > +     if (!dma_fence_is_signaled_locked(&pt->base)) {
> > +             struct rb_node **p = &obj->pt_tree.rb_node;
> > +             struct rb_node *parent = NULL;
> > +
> > +             while (*p) {
> > +                     struct sync_pt *other;
> > +                     int cmp;
> > +
> > +                     parent = *p;
> > +                     other = rb_entry(parent, typeof(*pt), node);
> > +                     cmp = value - other->base.seqno;
> 
> We're imposing an implicit limitation on userspace here that points cannot
> be more than INT_MAX apart (since they'll be in the wrong order in the tree).

That's not a new limitation. It's an artifact of using the u32 timeline/seqno.
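
For illustration (a stand-alone sketch, not code from the patch): comparing
u32 seqnos through a signed difference behaves like serial-number arithmetic,
so the relative order of two points is only well defined while they are less
than 2^31 (INT_MAX) apart.

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a is after b" test, in the same style as the patch's
 * cmp = value - other->base.seqno; it misorders points once the
 * distance between them exceeds INT_MAX. */
static int seqno_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	printf("%d\n", seqno_after(1, 0xfffffffeu));  /* 1: wraparound handled */
	printf("%d\n", seqno_after(0x80000001u, 0));  /* 0: more than INT_MAX apart, misordered */
	return 0;
}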

> I wonder how much this patch will actually save, given that the number of active points
> on a given timeline should be small. Do we have any evidence that this
> optimization is warranted?

The good news is that for empty/small trees, the overhead is tiny, less
than the cost of the syscall. I honestly don't know who uses sw_sync and
so who would benefit. I figured I would throw it out here since it was
trivial.
-Chris

Patch

diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c
index e51fe11bbbea..8cef5d345316 100644
--- a/drivers/dma-buf/sw_sync.c
+++ b/drivers/dma-buf/sw_sync.c
@@ -96,6 +96,7 @@ static struct sync_timeline *sync_timeline_create(const char *name)
 	obj->context = dma_fence_context_alloc(1);
 	strlcpy(obj->name, name, sizeof(obj->name));
 
+	obj->pt_tree = RB_ROOT;
 	INIT_LIST_HEAD(&obj->pt_list);
 	spin_lock_init(&obj->lock);
 
@@ -142,9 +143,13 @@ static void sync_timeline_signal(struct sync_timeline *obj, unsigned int inc)
 
 	obj->value += inc;
 
-	list_for_each_entry_safe(pt, next, &obj->pt_list, link)
-		if (dma_fence_is_signaled_locked(&pt->base))
-			list_del_init(&pt->link);
+	list_for_each_entry_safe(pt, next, &obj->pt_list, link) {
+		if (!dma_fence_is_signaled_locked(&pt->base))
+			break;
+
+		list_del_init(&pt->link);
+		rb_erase(&pt->node, &obj->pt_tree);
+	}
 
 	spin_unlock_irq(&obj->lock);
 }
@@ -174,8 +179,38 @@ static struct sync_pt *sync_pt_create(struct sync_timeline *obj,
 	INIT_LIST_HEAD(&pt->link);
 
 	spin_lock_irq(&obj->lock);
-	if (!dma_fence_is_signaled_locked(&pt->base))
-		list_add_tail(&pt->link, &obj->pt_list);
+	if (!dma_fence_is_signaled_locked(&pt->base)) {
+		struct rb_node **p = &obj->pt_tree.rb_node;
+		struct rb_node *parent = NULL;
+
+		while (*p) {
+			struct sync_pt *other;
+			int cmp;
+
+			parent = *p;
+			other = rb_entry(parent, typeof(*pt), node);
+			cmp = value - other->base.seqno;
+			if (cmp > 0) {
+				p = &parent->rb_right;
+			} else if (cmp < 0) {
+				p = &parent->rb_left;
+			} else {
+				if (dma_fence_get_rcu(&other->base)) {
+					dma_fence_put(&pt->base);
+					pt = other;
+					goto unlock;
+				}
+				p = &parent->rb_left;
+			}
+		}
+		rb_link_node(&pt->node, parent, p);
+		rb_insert_color(&pt->node, &obj->pt_tree);
+
+		parent = rb_next(&pt->node);
+		list_add_tail(&pt->link,
+			      parent ? &rb_entry(parent, typeof(*pt), node)->link : &obj->pt_list);
+	}
+unlock:
 	spin_unlock_irq(&obj->lock);
 
 	return pt;
@@ -201,8 +236,10 @@ static void timeline_fence_release(struct dma_fence *fence)
 
 	spin_lock_irqsave(fence->lock, flags);
 
-	if (!list_empty(&pt->link))
+	if (!list_empty(&pt->link)) {
 		list_del(&pt->link);
+		rb_erase(&pt->node, &parent->pt_tree);
+	}
 
 	spin_unlock_irqrestore(fence->lock, flags);
 
diff --git a/drivers/dma-buf/sync_debug.h b/drivers/dma-buf/sync_debug.h
index 899ba0e19fd3..b7a5fab12179 100644
--- a/drivers/dma-buf/sync_debug.h
+++ b/drivers/dma-buf/sync_debug.h
@@ -14,6 +14,7 @@ 
 #define _LINUX_SYNC_H
 
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/spinlock.h>
 #include <linux/dma-fence.h>
 
@@ -25,6 +26,7 @@ 
  * @kref:		reference count on fence.
  * @name:		name of the sync_timeline. Useful for debugging
  * @lock:		lock protecting @child_list_head and fence.status
+ * @pt_tree:		rbtree of active (unsignaled/errored) sync_pts
  * @pt_list:		list of active (unsignaled/errored) sync_pts
  * @sync_timeline_list:	membership in global sync_timeline_list
  */
@@ -36,6 +38,7 @@ struct sync_timeline {
 	u64			context;
 	int			value;
 
+	struct rb_root		pt_tree;
 	struct list_head	pt_list;
 	spinlock_t		lock;
 
@@ -51,10 +54,12 @@ static inline struct sync_timeline *dma_fence_parent(struct dma_fence *fence)
  * struct sync_pt - sync_pt object
  * @base: base fence object
  * @link: link on the sync timeline's list
+ * @node: node in the sync timeline's tree
  */
 struct sync_pt {
 	struct dma_fence base;
 	struct list_head link;
+	struct rb_node node;
 };
 
 #ifdef CONFIG_SW_SYNC