diff mbox

dma-buf: Refanctor reservation_object_add_shared_fence()

Message ID 20180130093235.2859-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Jan. 30, 2018, 9:32 a.m. UTC
Adding a shared fence to a reservation_object is currently split into
two handlers, one to insert the fence into the existing array and the
other to replace the existing array with a new larger array. The first
step in both of these routines involves scanning the existing array to
decide if it can prune any of the existing fences.

As both routines perform essentially the same loop, we can combine them
into one routine. During the first scan over the existing array, we
search for a slot with which we can reuse for the new fence (discarding
the previous). If we find no available slot to reuse, and the array is
already at its max size, then we know that we need to switch to the
larger array, and that no existing fence needs to be discard, they can
all be copied over.

add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-627 (-627)
Function                                     old     new   delta
__warned                                    2352    2350      -2
reservation_object_reserve_shared            196     185     -11
reservation_object_add_shared_fence         1272     658    -614

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/dma-buf/reservation.c | 178 ++++++++++++++----------------------------
 1 file changed, 60 insertions(+), 118 deletions(-)

Comments

Christian König Jan. 30, 2018, 12:26 p.m. UTC | #1
Am 30.01.2018 um 10:32 schrieb Chris Wilson:
> Adding a shared fence to a reservation_object is currently split into
> two handlers, one to insert the fence into the existing array and the
> other to replace the existing array with a new larger array. The first
> step in both of these routines involves scanning the existing array to
> decide if it can prune any of the existing fences.
>
> As both routines perform essentially the same loop, we can combine them
> into one routine. During the first scan over the existing array, we
> search for a slot with which we can reuse for the new fence (discarding
> the previous). If we find no available slot to reuse, and the array is
> already at its max size, then we know that we need to switch to the
> larger array, and that no existing fence needs to be discard, they can
> all be copied over.
>
> add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-627 (-627)
> Function                                     old     new   delta
> __warned                                    2352    2350      -2
> reservation_object_reserve_shared            196     185     -11
> reservation_object_add_shared_fence         1272     658    -614

Looks good to me, but still two comments:

1. It looks like we now can have the same context multiple times in an 
reservation object. Could we avoid that?

2. When the we copied the fences over into the new container we 
previously removed all signaled one. That had the really nice side 
effect of handling them during command submission quite a bit faster. 
Can we keep that as well?

Regards,
Christian.

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Alex Deucher <alexander.deucher@amd.com>
> ---
>   drivers/dma-buf/reservation.c | 178 ++++++++++++++----------------------------
>   1 file changed, 60 insertions(+), 118 deletions(-)
>
> diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
> index 314eb1071cce..de7b6e709a68 100644
> --- a/drivers/dma-buf/reservation.c
> +++ b/drivers/dma-buf/reservation.c
> @@ -76,8 +76,6 @@ int reservation_object_reserve_shared(struct reservation_object *obj)
>   	if (old && old->shared_max) {
>   		if (old->shared_count < old->shared_max) {
>   			/* perform an in-place update */
> -			kfree(obj->staged);
> -			obj->staged = NULL;
>   			return 0;
>   		} else
>   			max = old->shared_max * 2;
> @@ -95,146 +93,90 @@ int reservation_object_reserve_shared(struct reservation_object *obj)
>   
>   	obj->staged = fobj;
>   	fobj->shared_max = max;
> +	fobj->shared_count = 0;
>   	return 0;
>   }
>   EXPORT_SYMBOL(reservation_object_reserve_shared);
>   
> -static void
> -reservation_object_add_shared_inplace(struct reservation_object *obj,
> -				      struct reservation_object_list *fobj,
> -				      struct dma_fence *fence)
> +/**
> + * reservation_object_add_shared_fence - Add a fence to a shared slot
> + * @obj: the reservation object
> + * @fence: the shared fence to add
> + *
> + * Add a fence to a shared slot, obj->lock must be held, and
> + * reservation_object_reserve_shared() has been called.
> + */
> +void reservation_object_add_shared_fence(struct reservation_object *obj,
> +					 struct dma_fence *fence)
>   {
> -	struct dma_fence *signaled = NULL;
> -	u32 i, signaled_idx;
> +	struct reservation_object_list *old, *fobj;
> +	struct dma_fence *replace = NULL;
> +	u32 max = 0, i = 0;
>   
>   	dma_fence_get(fence);
>   
> -	preempt_disable();
> -	write_seqcount_begin(&obj->seq);
> -
> -	for (i = 0; i < fobj->shared_count; ++i) {
> -		struct dma_fence *old_fence;
> -
> -		old_fence = rcu_dereference_protected(fobj->shared[i],
> -						reservation_object_held(obj));
> -
> -		if (old_fence->context == fence->context) {
> -			/* memory barrier is added by write_seqcount_begin */
> -			RCU_INIT_POINTER(fobj->shared[i], fence);
> -			write_seqcount_end(&obj->seq);
> -			preempt_enable();
> -
> -			dma_fence_put(old_fence);
> -			return;
> -		}
> -
> -		if (!signaled && dma_fence_is_signaled(old_fence)) {
> -			signaled = old_fence;
> -			signaled_idx = i;
> -		}
> -	}
> -
>   	/*
> -	 * memory barrier is added by write_seqcount_begin,
> -	 * fobj->shared_count is protected by this lock too
> +	 * Find an existing slot we can overwrite with the new fence.
> +	 *
> +	 * It is assumed that fences are added sequentially, that a second
> +	 * fence from the same context as an existing fence is strictly later
> +	 * than the existing fence. Therefore any new fence on an existing
> +	 * context can simply replace that existing fence.
> +	 *
> +	 * Likewise, if a fence is already signaled it is no longer required
> +	 * as part of the reservation_object and can be replaced by the new
> +	 * fence. This allows us to gradually prune the shared fence array
> +	 * and prevent its excessive growth.
>   	 */
> -	if (signaled) {
> -		RCU_INIT_POINTER(fobj->shared[signaled_idx], fence);
> -	} else {
> -		RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
> -		fobj->shared_count++;
> -	}
> -
> -	write_seqcount_end(&obj->seq);
> -	preempt_enable();
> +	old = reservation_object_get_list(obj);
> +	if (old) {
> +		u32 ctx = fence->context;
>   
> -	dma_fence_put(signaled);
> -}
> +		for (; i < old->shared_count; ++i) {
> +			struct dma_fence *check;
>   
> -static void
> -reservation_object_add_shared_replace(struct reservation_object *obj,
> -				      struct reservation_object_list *old,
> -				      struct reservation_object_list *fobj,
> -				      struct dma_fence *fence)
> -{
> -	unsigned i, j, k;
> +			check = rcu_dereference_protected(old->shared[i],
> +							  reservation_object_held(obj));
>   
> -	dma_fence_get(fence);
> +			if (check->context == ctx ||
> +			    dma_fence_is_signaled(check)) {
> +				replace = check;
> +				break;
> +			}
> +		}
>   
> -	if (!old) {
> -		RCU_INIT_POINTER(fobj->shared[0], fence);
> -		fobj->shared_count = 1;
> -		goto done;
> +		fobj = old;
> +		max = old->shared_max;
>   	}
>   
> -	/*
> -	 * no need to bump fence refcounts, rcu_read access
> -	 * requires the use of kref_get_unless_zero, and the
> -	 * references from the old struct are carried over to
> -	 * the new.
> -	 */
> -	for (i = 0, j = 0, k = fobj->shared_max; i < old->shared_count; ++i) {
> -		struct dma_fence *check;
> -
> -		check = rcu_dereference_protected(old->shared[i],
> -						reservation_object_held(obj));
> +	if (i >= max) {
> +		fobj = obj->staged;
> +		obj->staged = NULL;
>   
> -		if (check->context == fence->context ||
> -		    dma_fence_is_signaled(check))
> -			RCU_INIT_POINTER(fobj->shared[--k], check);
> -		else
> -			RCU_INIT_POINTER(fobj->shared[j++], check);
> +		if (old) {
> +			/* Borrow the old fence references */
> +			memcpy(fobj->shared, old->shared,
> +			       old->shared_count * sizeof(*old->shared));
> +			fobj->shared_count = old->shared_count;
> +		}
>   	}
> -	fobj->shared_count = j;
> -	RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
> -	fobj->shared_count++;
>   
> -done:
>   	preempt_disable();
>   	write_seqcount_begin(&obj->seq);
> -	/*
> -	 * RCU_INIT_POINTER can be used here,
> -	 * seqcount provides the necessary barriers
> -	 */
> -	RCU_INIT_POINTER(obj->fence, fobj);
> -	write_seqcount_end(&obj->seq);
> -	preempt_enable();
> -
> -	if (!old)
> -		return;
> -
> -	/* Drop the references to the signaled fences */
> -	for (i = k; i < fobj->shared_max; ++i) {
> -		struct dma_fence *f;
>   
> -		f = rcu_dereference_protected(fobj->shared[i],
> -					      reservation_object_held(obj));
> -		dma_fence_put(f);
> -	}
> -	kfree_rcu(old, rcu);
> -}
> -
> -/**
> - * reservation_object_add_shared_fence - Add a fence to a shared slot
> - * @obj: the reservation object
> - * @fence: the shared fence to add
> - *
> - * Add a fence to a shared slot, obj->lock must be held, and
> - * reservation_object_reserve_shared() has been called.
> - */
> -void reservation_object_add_shared_fence(struct reservation_object *obj,
> -					 struct dma_fence *fence)
> -{
> -	struct reservation_object_list *old, *fobj = obj->staged;
> +	RCU_INIT_POINTER(fobj->shared[i], fence);
> +	if (!replace)
> +		fobj->shared_count++;
> +	if (fobj != old)
> +		RCU_INIT_POINTER(obj->fence, fobj);
>   
> -	old = reservation_object_get_list(obj);
> -	obj->staged = NULL;
> +	/* write_seqcount_end() provides the necessary mb for the RCU writes */
> +	write_seqcount_end(&obj->seq);
> +	preempt_enable();
>   
> -	if (!fobj) {
> -		BUG_ON(old->shared_count >= old->shared_max);
> -		reservation_object_add_shared_inplace(obj, old, fence);
> -	} else
> -		reservation_object_add_shared_replace(obj, old, fobj, fence);
> +	dma_fence_put(replace);
> +	if (old && fobj != old)
> +		kfree_rcu(old, rcu);
>   }
>   EXPORT_SYMBOL(reservation_object_add_shared_fence);
>
Chris Wilson Jan. 30, 2018, 12:50 p.m. UTC | #2
Quoting Christian König (2018-01-30 12:26:05)
> Am 30.01.2018 um 10:32 schrieb Chris Wilson:
> > Adding a shared fence to a reservation_object is currently split into
> > two handlers, one to insert the fence into the existing array and the
> > other to replace the existing array with a new larger array. The first
> > step in both of these routines involves scanning the existing array to
> > decide if it can prune any of the existing fences.
> >
> > As both routines perform essentially the same loop, we can combine them
> > into one routine. During the first scan over the existing array, we
> > search for a slot with which we can reuse for the new fence (discarding
> > the previous). If we find no available slot to reuse, and the array is
> > already at its max size, then we know that we need to switch to the
> > larger array, and that no existing fence needs to be discard, they can
> > all be copied over.
> >
> > add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-627 (-627)
> > Function                                     old     new   delta
> > __warned                                    2352    2350      -2
> > reservation_object_reserve_shared            196     185     -11
> > reservation_object_add_shared_fence         1272     658    -614
> 
> Looks good to me, but still two comments:
> 
> 1. It looks like we now can have the same context multiple times in an 
> reservation object. Could we avoid that?

Not without scanning the entire array. Multiple fences from the same
context isn't an issue for either of us as we both reduce down to the
last fence on each context, and in the case of iteratively waiting on
each fence in the reservation_object it is not noticeable.

The trade-off is in choosing to scan the entire array.

> 2. When the we copied the fences over into the new container we 
> previously removed all signaled one. That had the really nice side 
> effect of handling them during command submission quite a bit faster. 
> Can we keep that as well?

We prove that before we copy to a new array, there are no currently
signaled fences left.

Ime, when processing the fences you generally have to discard the signaled
fences anyway, so the benefit for add_shared_fence is in pruning any
signaled fences to prevent realloc; and I prefer the more gradual
approach for the trade-off in reducing the work on adding the fence.

The alternative is that we basically run the replace algorithm and
decide to the final op inplace (and not consume obj->staged) if we only
need to replace/add one fence.
-Chris
Christian König Jan. 30, 2018, 1:01 p.m. UTC | #3
Am 30.01.2018 um 13:50 schrieb Chris Wilson:
> Quoting Christian König (2018-01-30 12:26:05)
>> Am 30.01.2018 um 10:32 schrieb Chris Wilson:
>>> Adding a shared fence to a reservation_object is currently split into
>>> two handlers, one to insert the fence into the existing array and the
>>> other to replace the existing array with a new larger array. The first
>>> step in both of these routines involves scanning the existing array to
>>> decide if it can prune any of the existing fences.
>>>
>>> As both routines perform essentially the same loop, we can combine them
>>> into one routine. During the first scan over the existing array, we
>>> search for a slot with which we can reuse for the new fence (discarding
>>> the previous). If we find no available slot to reuse, and the array is
>>> already at its max size, then we know that we need to switch to the
>>> larger array, and that no existing fence needs to be discard, they can
>>> all be copied over.
>>>
>>> add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-627 (-627)
>>> Function                                     old     new   delta
>>> __warned                                    2352    2350      -2
>>> reservation_object_reserve_shared            196     185     -11
>>> reservation_object_add_shared_fence         1272     658    -614
>> Looks good to me, but still two comments:
>>
>> 1. It looks like we now can have the same context multiple times in an
>> reservation object. Could we avoid that?
> Not without scanning the entire array. Multiple fences from the same
> context isn't an issue for either of us as we both reduce down to the
> last fence on each context, and in the case of iteratively waiting on
> each fence in the reservation_object it is not noticeable.
>
> The trade-off is in choosing to scan the entire array.

So the only extra overhead is when scanning the fences during command 
submission and there it doesn't really matter if we need to handle an 
already signaled one or multiple signaled one from the same context.

Well, that is a good point.

>
>> 2. When the we copied the fences over into the new container we
>> previously removed all signaled one. That had the really nice side
>> effect of handling them during command submission quite a bit faster.
>> Can we keep that as well?
> We prove that before we copy to a new array, there are no currently
> signaled fences left.

Ah, yeah missed that. Ok in this case the patch is Reviewed-by: 
Christian König <christian.koenig@amd.com>.

Regards,
Christian.

>
> Ime, when processing the fences you generally have to discard the signaled
> fences anyway, so the benefit for add_shared_fence is in pruning any
> signaled fences to prevent realloc; and I prefer the more gradual
> approach for the trade-off in reducing the work on adding the fence.
>
> The alternative is that we basically run the replace algorithm and
> decide to the final op inplace (and not consume obj->staged) if we only
> need to replace/add one fence.
> -Chris
diff mbox

Patch

diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c
index 314eb1071cce..de7b6e709a68 100644
--- a/drivers/dma-buf/reservation.c
+++ b/drivers/dma-buf/reservation.c
@@ -76,8 +76,6 @@  int reservation_object_reserve_shared(struct reservation_object *obj)
 	if (old && old->shared_max) {
 		if (old->shared_count < old->shared_max) {
 			/* perform an in-place update */
-			kfree(obj->staged);
-			obj->staged = NULL;
 			return 0;
 		} else
 			max = old->shared_max * 2;
@@ -95,146 +93,90 @@  int reservation_object_reserve_shared(struct reservation_object *obj)
 
 	obj->staged = fobj;
 	fobj->shared_max = max;
+	fobj->shared_count = 0;
 	return 0;
 }
 EXPORT_SYMBOL(reservation_object_reserve_shared);
 
-static void
-reservation_object_add_shared_inplace(struct reservation_object *obj,
-				      struct reservation_object_list *fobj,
-				      struct dma_fence *fence)
+/**
+ * reservation_object_add_shared_fence - Add a fence to a shared slot
+ * @obj: the reservation object
+ * @fence: the shared fence to add
+ *
+ * Add a fence to a shared slot, obj->lock must be held, and
+ * reservation_object_reserve_shared() has been called.
+ */
+void reservation_object_add_shared_fence(struct reservation_object *obj,
+					 struct dma_fence *fence)
 {
-	struct dma_fence *signaled = NULL;
-	u32 i, signaled_idx;
+	struct reservation_object_list *old, *fobj;
+	struct dma_fence *replace = NULL;
+	u32 max = 0, i = 0;
 
 	dma_fence_get(fence);
 
-	preempt_disable();
-	write_seqcount_begin(&obj->seq);
-
-	for (i = 0; i < fobj->shared_count; ++i) {
-		struct dma_fence *old_fence;
-
-		old_fence = rcu_dereference_protected(fobj->shared[i],
-						reservation_object_held(obj));
-
-		if (old_fence->context == fence->context) {
-			/* memory barrier is added by write_seqcount_begin */
-			RCU_INIT_POINTER(fobj->shared[i], fence);
-			write_seqcount_end(&obj->seq);
-			preempt_enable();
-
-			dma_fence_put(old_fence);
-			return;
-		}
-
-		if (!signaled && dma_fence_is_signaled(old_fence)) {
-			signaled = old_fence;
-			signaled_idx = i;
-		}
-	}
-
 	/*
-	 * memory barrier is added by write_seqcount_begin,
-	 * fobj->shared_count is protected by this lock too
+	 * Find an existing slot we can overwrite with the new fence.
+	 *
+	 * It is assumed that fences are added sequentially, that a second
+	 * fence from the same context as an existing fence is strictly later
+	 * than the existing fence. Therefore any new fence on an existing
+	 * context can simply replace that existing fence.
+	 *
+	 * Likewise, if a fence is already signaled it is no longer required
+	 * as part of the reservation_object and can be replaced by the new
+	 * fence. This allows us to gradually prune the shared fence array
+	 * and prevent its excessive growth.
 	 */
-	if (signaled) {
-		RCU_INIT_POINTER(fobj->shared[signaled_idx], fence);
-	} else {
-		RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
-		fobj->shared_count++;
-	}
-
-	write_seqcount_end(&obj->seq);
-	preempt_enable();
+	old = reservation_object_get_list(obj);
+	if (old) {
+		u32 ctx = fence->context;
 
-	dma_fence_put(signaled);
-}
+		for (; i < old->shared_count; ++i) {
+			struct dma_fence *check;
 
-static void
-reservation_object_add_shared_replace(struct reservation_object *obj,
-				      struct reservation_object_list *old,
-				      struct reservation_object_list *fobj,
-				      struct dma_fence *fence)
-{
-	unsigned i, j, k;
+			check = rcu_dereference_protected(old->shared[i],
+							  reservation_object_held(obj));
 
-	dma_fence_get(fence);
+			if (check->context == ctx ||
+			    dma_fence_is_signaled(check)) {
+				replace = check;
+				break;
+			}
+		}
 
-	if (!old) {
-		RCU_INIT_POINTER(fobj->shared[0], fence);
-		fobj->shared_count = 1;
-		goto done;
+		fobj = old;
+		max = old->shared_max;
 	}
 
-	/*
-	 * no need to bump fence refcounts, rcu_read access
-	 * requires the use of kref_get_unless_zero, and the
-	 * references from the old struct are carried over to
-	 * the new.
-	 */
-	for (i = 0, j = 0, k = fobj->shared_max; i < old->shared_count; ++i) {
-		struct dma_fence *check;
-
-		check = rcu_dereference_protected(old->shared[i],
-						reservation_object_held(obj));
+	if (i >= max) {
+		fobj = obj->staged;
+		obj->staged = NULL;
 
-		if (check->context == fence->context ||
-		    dma_fence_is_signaled(check))
-			RCU_INIT_POINTER(fobj->shared[--k], check);
-		else
-			RCU_INIT_POINTER(fobj->shared[j++], check);
+		if (old) {
+			/* Borrow the old fence references */
+			memcpy(fobj->shared, old->shared,
+			       old->shared_count * sizeof(*old->shared));
+			fobj->shared_count = old->shared_count;
+		}
 	}
-	fobj->shared_count = j;
-	RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence);
-	fobj->shared_count++;
 
-done:
 	preempt_disable();
 	write_seqcount_begin(&obj->seq);
-	/*
-	 * RCU_INIT_POINTER can be used here,
-	 * seqcount provides the necessary barriers
-	 */
-	RCU_INIT_POINTER(obj->fence, fobj);
-	write_seqcount_end(&obj->seq);
-	preempt_enable();
-
-	if (!old)
-		return;
-
-	/* Drop the references to the signaled fences */
-	for (i = k; i < fobj->shared_max; ++i) {
-		struct dma_fence *f;
 
-		f = rcu_dereference_protected(fobj->shared[i],
-					      reservation_object_held(obj));
-		dma_fence_put(f);
-	}
-	kfree_rcu(old, rcu);
-}
-
-/**
- * reservation_object_add_shared_fence - Add a fence to a shared slot
- * @obj: the reservation object
- * @fence: the shared fence to add
- *
- * Add a fence to a shared slot, obj->lock must be held, and
- * reservation_object_reserve_shared() has been called.
- */
-void reservation_object_add_shared_fence(struct reservation_object *obj,
-					 struct dma_fence *fence)
-{
-	struct reservation_object_list *old, *fobj = obj->staged;
+	RCU_INIT_POINTER(fobj->shared[i], fence);
+	if (!replace)
+		fobj->shared_count++;
+	if (fobj != old)
+		RCU_INIT_POINTER(obj->fence, fobj);
 
-	old = reservation_object_get_list(obj);
-	obj->staged = NULL;
+	/* write_seqcount_end() provides the necessary mb for the RCU writes */
+	write_seqcount_end(&obj->seq);
+	preempt_enable();
 
-	if (!fobj) {
-		BUG_ON(old->shared_count >= old->shared_max);
-		reservation_object_add_shared_inplace(obj, old, fence);
-	} else
-		reservation_object_add_shared_replace(obj, old, fobj, fence);
+	dma_fence_put(replace);
+	if (old && fobj != old)
+		kfree_rcu(old, rcu);
 }
 EXPORT_SYMBOL(reservation_object_add_shared_fence);