diff mbox

[6/6] drm/i915: Store the vma in an rbtree under the object

Message ID 20161031102645.29495-6-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Oct. 31, 2016, 10:26 a.m. UTC
With full-ppgtt one of the main bottlenecks is the lookup of the VMA
underneath the object. For execbuf there is merit in having a very fast
direct lookup of ctx:handle to the vma using a hashtree, but that still
leaves a large number of other lookups. One way to speed up the lookup
would be to use a rhashtable, but that requires extra allocations and
may exhibit poor worse case behaviour. An alternative is to use an
embedded rbtree, i.e. no extra allocations and deterministic behaviour,
but at the slight cost of O(lgN) lookups (instead of O(1) for
rhashtable). The major of such tree will be very shallow and so not much
slower, and still scales much, much better than the current unsorted
list.

References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h     |  1 +
 drivers/gpu/drm/i915/i915_gem_gtt.c | 80 +++++++++++++++++++++++++------------
 drivers/gpu/drm/i915/i915_gem_gtt.h |  1 +
 3 files changed, 57 insertions(+), 25 deletions(-)

Comments

Tvrtko Ursulin Nov. 1, 2016, 8:41 a.m. UTC | #1
On 31/10/2016 10:26, Chris Wilson wrote:
> With full-ppgtt one of the main bottlenecks is the lookup of the VMA
> underneath the object. For execbuf there is merit in having a very fast
> direct lookup of ctx:handle to the vma using a hashtree, but that still
> leaves a large number of other lookups. One way to speed up the lookup
> would be to use a rhashtable, but that requires extra allocations and
> may exhibit poor worse case behaviour. An alternative is to use an
> embedded rbtree, i.e. no extra allocations and deterministic behaviour,
> but at the slight cost of O(lgN) lookups (instead of O(1) for
> rhashtable). The major of such tree will be very shallow and so not much
> slower, and still scales much, much better than the current unsorted
> list.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

I suggest leaving this out of the mini-series which fixes the recently 
introduced bugs.

Regards,

Tvrtko


> ---
>  drivers/gpu/drm/i915/i915_drv.h     |  1 +
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 80 +++++++++++++++++++++++++------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h |  1 +
>  3 files changed, 57 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7a18bf66f797..e923d6596cac 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2230,6 +2230,7 @@ struct drm_i915_gem_object {
>
>  	/** List of VMAs backed by this object */
>  	struct list_head vma_list;
> +	struct rb_root vma_tree;
>
>  	/** Stolen memory for this object, instead of being backed by shmem. */
>  	struct drm_mm_node *stolen;
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index e7afad585929..aa2d21c41091 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -3399,6 +3399,7 @@ void i915_vma_destroy(struct i915_vma *vma)
>  	GEM_BUG_ON(!i915_vma_is_closed(vma));
>  	GEM_BUG_ON(vma->fence);
>
> +	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
>  	list_del(&vma->vm_link);
>  	if (!i915_vma_is_ggtt(vma))
>  		i915_ppgtt_put(i915_vm_to_ppgtt(vma->vm));
> @@ -3416,12 +3417,33 @@ void i915_vma_close(struct i915_vma *vma)
>  		WARN_ON(i915_vma_unbind(vma));
>  }
>
> +static inline int vma_compare(struct i915_vma *vma,
> +			      struct i915_address_space *vm,
> +			      const struct i915_ggtt_view *view)
> +{
> +	GEM_BUG_ON(view && !i915_vma_is_ggtt(vma));
> +
> +	if (vma->vm != vm)
> +		return vma->vm - vm;
> +
> +	if (!view)
> +		return vma->ggtt_view.type;
> +
> +	if (vma->ggtt_view.type != view->type)
> +		return vma->ggtt_view.type - view->type;
> +
> +	return memcmp(&vma->ggtt_view.params,
> +		      &view->params,
> +		      sizeof(view->params));
> +}
> +
>  static struct i915_vma *
>  __i915_vma_create(struct drm_i915_gem_object *obj,
>  		  struct i915_address_space *vm,
>  		  const struct i915_ggtt_view *view)
>  {
>  	struct i915_vma *vma;
> +	struct rb_node *rb, **p;
>  	int i;
>
>  	GEM_BUG_ON(vm->closed);
> @@ -3455,33 +3477,28 @@ __i915_vma_create(struct drm_i915_gem_object *obj,
>
>  	if (i915_is_ggtt(vm)) {
>  		vma->flags |= I915_VMA_GGTT;
> +		list_add(&vma->obj_link, &obj->vma_list);
>  	} else {
>  		i915_ppgtt_get(i915_vm_to_ppgtt(vm));
> +		list_add_tail(&vma->obj_link, &obj->vma_list);
>  	}
>
> -	list_add_tail(&vma->obj_link, &obj->vma_list);
> -	return vma;
> -}
> +	rb = NULL;
> +	p = &obj->vma_tree.rb_node;
> +	while (*p) {
> +		struct i915_vma *pos;
>
> -static inline bool vma_matches(struct i915_vma *vma,
> -			       struct i915_address_space *vm,
> -			       const struct i915_ggtt_view *view)
> -{
> -	if (vma->vm != vm)
> -		return false;
> -
> -	if (!i915_vma_is_ggtt(vma))
> -		return true;
> -
> -	if (!view)
> -		return vma->ggtt_view.type == 0;
> -
> -	if (vma->ggtt_view.type != view->type)
> -		return false;
> +		rb = *p;
> +		pos = rb_entry(rb, struct i915_vma, obj_node);
> +		if (vma_compare(pos, vm, view) < 0)
> +			p = &rb->rb_right;
> +		else
> +			p = &rb->rb_left;
> +	}
> +	rb_link_node(&vma->obj_node, rb, p);
> +	rb_insert_color(&vma->obj_node, &obj->vma_tree);
>
> -	return memcmp(&vma->ggtt_view.params,
> -		      &view->params,
> -		      sizeof(view->params)) == 0;
> +	return vma;
>  }
>
>  struct i915_vma *
> @@ -3501,11 +3518,22 @@ i915_gem_obj_to_vma(struct drm_i915_gem_object *obj,
>  		    struct i915_address_space *vm,
>  		    const struct i915_ggtt_view *view)
>  {
> -	struct i915_vma *vma;
> +	struct rb_node *rb;
> +
> +	rb = obj->vma_tree.rb_node;
> +	while (rb) {
> +		struct i915_vma *vma;
> +		int cmp;
>
> -	list_for_each_entry_reverse(vma, &obj->vma_list, obj_link)
> -		if (vma_matches(vma, vm, view))
> +		vma = rb_entry(rb, struct i915_vma, obj_node);
> +		cmp = vma_compare(vma, vm, view);
> +		if (cmp == 0)
>  			return vma;
> +		else if (cmp < 0)
> +			rb = rb->rb_right;
> +		else
> +			rb = rb->rb_left;
> +	}
>
>  	return NULL;
>  }
> @@ -3521,8 +3549,10 @@ i915_gem_obj_lookup_or_create_vma(struct drm_i915_gem_object *obj,
>  	GEM_BUG_ON(view && !i915_is_ggtt(vm));
>
>  	vma = i915_gem_obj_to_vma(obj, vm, view);
> -	if (!vma)
> +	if (!vma) {
>  		vma = __i915_vma_create(obj, vm, view);
> +		GEM_BUG_ON(vma != i915_gem_obj_to_vma(obj, vm, view));
> +	}
>
>  	GEM_BUG_ON(i915_vma_is_closed(vma));
>  	return vma;
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 518e75b64290..c23ef9db1f53 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -227,6 +227,7 @@ struct i915_vma {
>  	struct list_head vm_link;
>
>  	struct list_head obj_link; /* Link in the object's VMA list */
> +	struct rb_node obj_node;
>
>  	/** This vma's place in the batchbuffer or on the eviction list */
>  	struct list_head exec_list;
>
Chris Wilson Nov. 1, 2016, 8:50 a.m. UTC | #2
On Tue, Nov 01, 2016 at 08:41:01AM +0000, Tvrtko Ursulin wrote:
> 
> On 31/10/2016 10:26, Chris Wilson wrote:
> >With full-ppgtt one of the main bottlenecks is the lookup of the VMA
> >underneath the object. For execbuf there is merit in having a very fast
> >direct lookup of ctx:handle to the vma using a hashtree, but that still
> >leaves a large number of other lookups. One way to speed up the lookup
> >would be to use a rhashtable, but that requires extra allocations and
> >may exhibit poor worse case behaviour. An alternative is to use an
> >embedded rbtree, i.e. no extra allocations and deterministic behaviour,
> >but at the slight cost of O(lgN) lookups (instead of O(1) for
> >rhashtable). The major of such tree will be very shallow and so not much
> >slower, and still scales much, much better than the current unsorted
> >list.
> >
> >References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> I suggest leaving this out of the mini-series which fixes the
> recently introduced bugs.

Just review it now, it's a two year old bug that was impacting the test
cases I was running... :-p Then we won't have to review it next week ;)
-Chris
Tvrtko Ursulin Nov. 1, 2016, 8:54 a.m. UTC | #3
On 01/11/2016 08:50, Chris Wilson wrote:
> On Tue, Nov 01, 2016 at 08:41:01AM +0000, Tvrtko Ursulin wrote:
>>
>> On 31/10/2016 10:26, Chris Wilson wrote:
>>> With full-ppgtt one of the main bottlenecks is the lookup of the VMA
>>> underneath the object. For execbuf there is merit in having a very fast
>>> direct lookup of ctx:handle to the vma using a hashtree, but that still
>>> leaves a large number of other lookups. One way to speed up the lookup
>>> would be to use a rhashtable, but that requires extra allocations and
>>> may exhibit poor worse case behaviour. An alternative is to use an
>>> embedded rbtree, i.e. no extra allocations and deterministic behaviour,
>>> but at the slight cost of O(lgN) lookups (instead of O(1) for
>>> rhashtable). The major of such tree will be very shallow and so not much
>>> slower, and still scales much, much better than the current unsorted
>>> list.
>>>
>>> References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> I suggest leaving this out of the mini-series which fixes the
>> recently introduced bugs.
>
> Just review it now, it's a two year old bug that was impacting the test
> cases I was running... :-p Then we won't have to review it next week ;)

I'll review it, just split it out please so the CI fire can be 
extinguished first.

Regards,

Tvrtko
Chris Wilson Nov. 1, 2016, 9:06 a.m. UTC | #4
On Tue, Nov 01, 2016 at 08:54:09AM +0000, Tvrtko Ursulin wrote:
> 
> On 01/11/2016 08:50, Chris Wilson wrote:
> >On Tue, Nov 01, 2016 at 08:41:01AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 31/10/2016 10:26, Chris Wilson wrote:
> >>>With full-ppgtt one of the main bottlenecks is the lookup of the VMA
> >>>underneath the object. For execbuf there is merit in having a very fast
> >>>direct lookup of ctx:handle to the vma using a hashtree, but that still
> >>>leaves a large number of other lookups. One way to speed up the lookup
> >>>would be to use a rhashtable, but that requires extra allocations and
> >>>may exhibit poor worse case behaviour. An alternative is to use an
> >>>embedded rbtree, i.e. no extra allocations and deterministic behaviour,
> >>>but at the slight cost of O(lgN) lookups (instead of O(1) for
> >>>rhashtable). The major of such tree will be very shallow and so not much
> >>>slower, and still scales much, much better than the current unsorted
> >>>list.
> >>>
> >>>References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
> >>>Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>
> >>I suggest leaving this out of the mini-series which fixes the
> >>recently introduced bugs.
> >
> >Just review it now, it's a two year old bug that was impacting the test
> >cases I was running... :-p Then we won't have to review it next week ;)
> 
> I'll review it, just split it out please so the CI fire can be
> extinguished first.

Sure, I'm pushing the fixes as soon as they've been baked. There's a
regression fix that builds upon this patch for multiple timelines (the
linear walk in reservation_object is particularly nasty for scaling).
-Chris
Tvrtko Ursulin Nov. 1, 2016, 9:20 a.m. UTC | #5
On 01/11/2016 09:06, Chris Wilson wrote:
> On Tue, Nov 01, 2016 at 08:54:09AM +0000, Tvrtko Ursulin wrote:
>>
>> On 01/11/2016 08:50, Chris Wilson wrote:
>>> On Tue, Nov 01, 2016 at 08:41:01AM +0000, Tvrtko Ursulin wrote:
>>>>
>>>> On 31/10/2016 10:26, Chris Wilson wrote:
>>>>> With full-ppgtt one of the main bottlenecks is the lookup of the VMA
>>>>> underneath the object. For execbuf there is merit in having a very fast
>>>>> direct lookup of ctx:handle to the vma using a hashtree, but that still
>>>>> leaves a large number of other lookups. One way to speed up the lookup
>>>>> would be to use a rhashtable, but that requires extra allocations and
>>>>> may exhibit poor worse case behaviour. An alternative is to use an
>>>>> embedded rbtree, i.e. no extra allocations and deterministic behaviour,
>>>>> but at the slight cost of O(lgN) lookups (instead of O(1) for
>>>>> rhashtable). The major of such tree will be very shallow and so not much
>>>>> slower, and still scales much, much better than the current unsorted
>>>>> list.
>>>>>
>>>>> References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>
>>>> I suggest leaving this out of the mini-series which fixes the
>>>> recently introduced bugs.
>>>
>>> Just review it now, it's a two year old bug that was impacting the test
>>> cases I was running... :-p Then we won't have to review it next week ;)
>>
>> I'll review it, just split it out please so the CI fire can be
>> extinguished first.
>
> Sure, I'm pushing the fixes as soon as they've been baked. There's a
> regression fix that builds upon this patch for multiple timelines (the
> linear walk in reservation_object is particularly nasty for scaling).

Which one is that, the reservation_object_get_fences_rcu in 
i915_gem_object_wait_reservation?

Regards,

Tvrtko
Tvrtko Ursulin Nov. 1, 2016, 9:43 a.m. UTC | #6
On 31/10/2016 10:26, Chris Wilson wrote:
> With full-ppgtt one of the main bottlenecks is the lookup of the VMA
> underneath the object. For execbuf there is merit in having a very fast
> direct lookup of ctx:handle to the vma using a hashtree, but that still
> leaves a large number of other lookups. One way to speed up the lookup
> would be to use a rhashtable, but that requires extra allocations and
> may exhibit poor worse case behaviour. An alternative is to use an
> embedded rbtree, i.e. no extra allocations and deterministic behaviour,
> but at the slight cost of O(lgN) lookups (instead of O(1) for
> rhashtable). The major of such tree will be very shallow and so not much
> slower, and still scales much, much better than the current unsorted
> list.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.h     |  1 +
>  drivers/gpu/drm/i915/i915_gem_gtt.c | 80 +++++++++++++++++++++++++------------
>  drivers/gpu/drm/i915/i915_gem_gtt.h |  1 +
>  3 files changed, 57 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7a18bf66f797..e923d6596cac 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2230,6 +2230,7 @@ struct drm_i915_gem_object {
>
>  	/** List of VMAs backed by this object */
>  	struct list_head vma_list;
> +	struct rb_root vma_tree;
>
>  	/** Stolen memory for this object, instead of being backed by shmem. */
>  	struct drm_mm_node *stolen;
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index e7afad585929..aa2d21c41091 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -3399,6 +3399,7 @@ void i915_vma_destroy(struct i915_vma *vma)
>  	GEM_BUG_ON(!i915_vma_is_closed(vma));
>  	GEM_BUG_ON(vma->fence);
>
> +	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
>  	list_del(&vma->vm_link);
>  	if (!i915_vma_is_ggtt(vma))
>  		i915_ppgtt_put(i915_vm_to_ppgtt(vma->vm));
> @@ -3416,12 +3417,33 @@ void i915_vma_close(struct i915_vma *vma)
>  		WARN_ON(i915_vma_unbind(vma));
>  }
>
> +static inline int vma_compare(struct i915_vma *vma,
> +			      struct i915_address_space *vm,
> +			      const struct i915_ggtt_view *view)
> +{
> +	GEM_BUG_ON(view && !i915_vma_is_ggtt(vma));
> +
> +	if (vma->vm != vm)
> +		return vma->vm - vm;

This can theoretically overflow so I think long should be the return type.

> +
> +	if (!view)
> +		return vma->ggtt_view.type;
> +
> +	if (vma->ggtt_view.type != view->type)
> +		return vma->ggtt_view.type - view->type;
> +
> +	return memcmp(&vma->ggtt_view.params,
> +		      &view->params,
> +		      sizeof(view->params));
> +}
> +
>  static struct i915_vma *
>  __i915_vma_create(struct drm_i915_gem_object *obj,
>  		  struct i915_address_space *vm,
>  		  const struct i915_ggtt_view *view)
>  {
>  	struct i915_vma *vma;
> +	struct rb_node *rb, **p;
>  	int i;
>
>  	GEM_BUG_ON(vm->closed);
> @@ -3455,33 +3477,28 @@ __i915_vma_create(struct drm_i915_gem_object *obj,
>
>  	if (i915_is_ggtt(vm)) {
>  		vma->flags |= I915_VMA_GGTT;
> +		list_add(&vma->obj_link, &obj->vma_list);
>  	} else {
>  		i915_ppgtt_get(i915_vm_to_ppgtt(vm));
> +		list_add_tail(&vma->obj_link, &obj->vma_list);
>  	}

Will this make a difference to anything since it is not used for 
lookups? I suppose it makes a nicer debugfs output if nothing else so 
not complaining.

>
> -	list_add_tail(&vma->obj_link, &obj->vma_list);
> -	return vma;
> -}
> +	rb = NULL;
> +	p = &obj->vma_tree.rb_node;
> +	while (*p) {
> +		struct i915_vma *pos;
>
> -static inline bool vma_matches(struct i915_vma *vma,
> -			       struct i915_address_space *vm,
> -			       const struct i915_ggtt_view *view)
> -{
> -	if (vma->vm != vm)
> -		return false;
> -
> -	if (!i915_vma_is_ggtt(vma))
> -		return true;
> -
> -	if (!view)
> -		return vma->ggtt_view.type == 0;
> -
> -	if (vma->ggtt_view.type != view->type)
> -		return false;
> +		rb = *p;
> +		pos = rb_entry(rb, struct i915_vma, obj_node);
> +		if (vma_compare(pos, vm, view) < 0)
> +			p = &rb->rb_right;
> +		else
> +			p = &rb->rb_left;
> +	}
> +	rb_link_node(&vma->obj_node, rb, p);
> +	rb_insert_color(&vma->obj_node, &obj->vma_tree);
>
> -	return memcmp(&vma->ggtt_view.params,
> -		      &view->params,
> -		      sizeof(view->params)) == 0;
> +	return vma;
>  }
>
>  struct i915_vma *
> @@ -3501,11 +3518,22 @@ i915_gem_obj_to_vma(struct drm_i915_gem_object *obj,
>  		    struct i915_address_space *vm,
>  		    const struct i915_ggtt_view *view)
>  {
> -	struct i915_vma *vma;
> +	struct rb_node *rb;
> +
> +	rb = obj->vma_tree.rb_node;
> +	while (rb) {
> +		struct i915_vma *vma;
> +		int cmp;
>
> -	list_for_each_entry_reverse(vma, &obj->vma_list, obj_link)
> -		if (vma_matches(vma, vm, view))
> +		vma = rb_entry(rb, struct i915_vma, obj_node);
> +		cmp = vma_compare(vma, vm, view);
> +		if (cmp == 0)
>  			return vma;
> +		else if (cmp < 0)
> +			rb = rb->rb_right;
> +		else
> +			rb = rb->rb_left;
> +	}
>
>  	return NULL;
>  }
> @@ -3521,8 +3549,10 @@ i915_gem_obj_lookup_or_create_vma(struct drm_i915_gem_object *obj,
>  	GEM_BUG_ON(view && !i915_is_ggtt(vm));
>
>  	vma = i915_gem_obj_to_vma(obj, vm, view);
> -	if (!vma)
> +	if (!vma) {
>  		vma = __i915_vma_create(obj, vm, view);
> +		GEM_BUG_ON(vma != i915_gem_obj_to_vma(obj, vm, view));

Nice touch which makes the review so much easier! :)

> +	}
>
>  	GEM_BUG_ON(i915_vma_is_closed(vma));
>  	return vma;
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 518e75b64290..c23ef9db1f53 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -227,6 +227,7 @@ struct i915_vma {
>  	struct list_head vm_link;
>
>  	struct list_head obj_link; /* Link in the object's VMA list */
> +	struct rb_node obj_node;
>
>  	/** This vma's place in the batchbuffer or on the eviction list */
>  	struct list_head exec_list;
>

LGTM in general.

Just the comparison return type (and accompanying locals) and then:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Chris Wilson Nov. 1, 2016, 9:45 a.m. UTC | #7
On Tue, Nov 01, 2016 at 09:20:33AM +0000, Tvrtko Ursulin wrote:
> 
> On 01/11/2016 09:06, Chris Wilson wrote:
> >On Tue, Nov 01, 2016 at 08:54:09AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 01/11/2016 08:50, Chris Wilson wrote:
> >>>On Tue, Nov 01, 2016 at 08:41:01AM +0000, Tvrtko Ursulin wrote:
> >>>>
> >>>>On 31/10/2016 10:26, Chris Wilson wrote:
> >>>>>With full-ppgtt one of the main bottlenecks is the lookup of the VMA
> >>>>>underneath the object. For execbuf there is merit in having a very fast
> >>>>>direct lookup of ctx:handle to the vma using a hashtree, but that still
> >>>>>leaves a large number of other lookups. One way to speed up the lookup
> >>>>>would be to use a rhashtable, but that requires extra allocations and
> >>>>>may exhibit poor worse case behaviour. An alternative is to use an
> >>>>>embedded rbtree, i.e. no extra allocations and deterministic behaviour,
> >>>>>but at the slight cost of O(lgN) lookups (instead of O(1) for
> >>>>>rhashtable). The major of such tree will be very shallow and so not much
> >>>>>slower, and still scales much, much better than the current unsorted
> >>>>>list.
> >>>>>
> >>>>>References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
> >>>>>Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>
> >>>>I suggest leaving this out of the mini-series which fixes the
> >>>>recently introduced bugs.
> >>>
> >>>Just review it now, it's a two year old bug that was impacting the test
> >>>cases I was running... :-p Then we won't have to review it next week ;)
> >>
> >>I'll review it, just split it out please so the CI fire can be
> >>extinguished first.
> >
> >Sure, I'm pushing the fixes as soon as they've been baked. There's a
> >regression fix that builds upon this patch for multiple timelines (the
> >linear walk in reservation_object is particularly nasty for scaling).
> 
> Which one is that, the reservation_object_get_fences_rcu in
> i915_gem_object_wait_reservation?

Mostly reservation_object_add_shared_fence()

The reservation object is not autopruning, those fences stay there until
replaced. So as an object is used by more and more contexts (such as
many GL clients sharing the same resource) that shared array keeps on
growing, and we have to walk over it on every execbuf, both to update
the fence and indeed often to compute the await.

We can use a radixtree/idr or a rhashtable here since they share the RCU
share lookup required for reservation_object (and couple in the seqlock
for update tracking), the stumbling point is the that
reservation_object_reserve_shared_fence() guarrantees the next
add_shared_fence() will not fail (ENOMEM). Both idr/radixtree have
preload, but both use per-cpu caches and so require preemption disabling
from the point of reservation to use; i.e. not suitable. Changing them
to preload onto a specific tree seems reasonably easy except for
convincing core of their merit. (There changing idr looks like it would
be easier.) radixtree is easier to use than idr, but iterating the
radixtree for (get_fences_rcu) is not as cheap as I expect. :|
-Chris
Chris Wilson Nov. 1, 2016, 9:56 a.m. UTC | #8
On Tue, Nov 01, 2016 at 09:43:53AM +0000, Tvrtko Ursulin wrote:
> 
> On 31/10/2016 10:26, Chris Wilson wrote:
> >With full-ppgtt one of the main bottlenecks is the lookup of the VMA
> >underneath the object. For execbuf there is merit in having a very fast
> >direct lookup of ctx:handle to the vma using a hashtree, but that still
> >leaves a large number of other lookups. One way to speed up the lookup
> >would be to use a rhashtable, but that requires extra allocations and
> >may exhibit poor worse case behaviour. An alternative is to use an
> >embedded rbtree, i.e. no extra allocations and deterministic behaviour,
> >but at the slight cost of O(lgN) lookups (instead of O(1) for
> >rhashtable). The major of such tree will be very shallow and so not much
> >slower, and still scales much, much better than the current unsorted
> >list.
> >
> >References: https://bugs.freedesktop.org/show_bug.cgi?id=87726
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_drv.h     |  1 +
> > drivers/gpu/drm/i915/i915_gem_gtt.c | 80 +++++++++++++++++++++++++------------
> > drivers/gpu/drm/i915/i915_gem_gtt.h |  1 +
> > 3 files changed, 57 insertions(+), 25 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 7a18bf66f797..e923d6596cac 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -2230,6 +2230,7 @@ struct drm_i915_gem_object {
> >
> > 	/** List of VMAs backed by this object */
> > 	struct list_head vma_list;
> >+	struct rb_root vma_tree;
> >
> > 	/** Stolen memory for this object, instead of being backed by shmem. */
> > 	struct drm_mm_node *stolen;
> >diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >index e7afad585929..aa2d21c41091 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> >@@ -3399,6 +3399,7 @@ void i915_vma_destroy(struct i915_vma *vma)
> > 	GEM_BUG_ON(!i915_vma_is_closed(vma));
> > 	GEM_BUG_ON(vma->fence);
> >
> >+	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
> > 	list_del(&vma->vm_link);
> > 	if (!i915_vma_is_ggtt(vma))
> > 		i915_ppgtt_put(i915_vm_to_ppgtt(vma->vm));
> >@@ -3416,12 +3417,33 @@ void i915_vma_close(struct i915_vma *vma)
> > 		WARN_ON(i915_vma_unbind(vma));
> > }
> >
> >+static inline int vma_compare(struct i915_vma *vma,
> >+			      struct i915_address_space *vm,
> >+			      const struct i915_ggtt_view *view)
> >+{
> >+	GEM_BUG_ON(view && !i915_vma_is_ggtt(vma));
> >+
> >+	if (vma->vm != vm)
> >+		return vma->vm - vm;
> 
> This can theoretically overflow so I think long should be the return type.

Yeah, the same thought cross through my mind. gcc should be smart enough
to only use the cc flags, but still...

I miss the spaceship operator.
 
> >+
> >+	if (!view)
> >+		return vma->ggtt_view.type;
> >+
> >+	if (vma->ggtt_view.type != view->type)
> >+		return vma->ggtt_view.type - view->type;
> >+
> >+	return memcmp(&vma->ggtt_view.params,
> >+		      &view->params,
> >+		      sizeof(view->params));
> >+}
> >+
> > static struct i915_vma *
> > __i915_vma_create(struct drm_i915_gem_object *obj,
> > 		  struct i915_address_space *vm,
> > 		  const struct i915_ggtt_view *view)
> > {
> > 	struct i915_vma *vma;
> >+	struct rb_node *rb, **p;
> > 	int i;
> >
> > 	GEM_BUG_ON(vm->closed);
> >@@ -3455,33 +3477,28 @@ __i915_vma_create(struct drm_i915_gem_object *obj,
> >
> > 	if (i915_is_ggtt(vm)) {
> > 		vma->flags |= I915_VMA_GGTT;
> >+		list_add(&vma->obj_link, &obj->vma_list);
> > 	} else {
> > 		i915_ppgtt_get(i915_vm_to_ppgtt(vm));
> >+		list_add_tail(&vma->obj_link, &obj->vma_list);
> > 	}
> 
> Will this make a difference to anything since it is not used for
> lookups? I suppose it makes a nicer debugfs output if nothing else
> so not complaining.

It does. The code assumes GGTT entries are first (some of my patches
came from a time before the regression was introduced by Daniel because
he claimed it made no difference but had not benchmarked it!).
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7a18bf66f797..e923d6596cac 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2230,6 +2230,7 @@  struct drm_i915_gem_object {
 
 	/** List of VMAs backed by this object */
 	struct list_head vma_list;
+	struct rb_root vma_tree;
 
 	/** Stolen memory for this object, instead of being backed by shmem. */
 	struct drm_mm_node *stolen;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index e7afad585929..aa2d21c41091 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3399,6 +3399,7 @@  void i915_vma_destroy(struct i915_vma *vma)
 	GEM_BUG_ON(!i915_vma_is_closed(vma));
 	GEM_BUG_ON(vma->fence);
 
+	rb_erase(&vma->obj_node, &vma->obj->vma_tree);
 	list_del(&vma->vm_link);
 	if (!i915_vma_is_ggtt(vma))
 		i915_ppgtt_put(i915_vm_to_ppgtt(vma->vm));
@@ -3416,12 +3417,33 @@  void i915_vma_close(struct i915_vma *vma)
 		WARN_ON(i915_vma_unbind(vma));
 }
 
+static inline int vma_compare(struct i915_vma *vma,
+			      struct i915_address_space *vm,
+			      const struct i915_ggtt_view *view)
+{
+	GEM_BUG_ON(view && !i915_vma_is_ggtt(vma));
+
+	if (vma->vm != vm)
+		return vma->vm - vm;
+
+	if (!view)
+		return vma->ggtt_view.type;
+
+	if (vma->ggtt_view.type != view->type)
+		return vma->ggtt_view.type - view->type;
+
+	return memcmp(&vma->ggtt_view.params,
+		      &view->params,
+		      sizeof(view->params));
+}
+
 static struct i915_vma *
 __i915_vma_create(struct drm_i915_gem_object *obj,
 		  struct i915_address_space *vm,
 		  const struct i915_ggtt_view *view)
 {
 	struct i915_vma *vma;
+	struct rb_node *rb, **p;
 	int i;
 
 	GEM_BUG_ON(vm->closed);
@@ -3455,33 +3477,28 @@  __i915_vma_create(struct drm_i915_gem_object *obj,
 
 	if (i915_is_ggtt(vm)) {
 		vma->flags |= I915_VMA_GGTT;
+		list_add(&vma->obj_link, &obj->vma_list);
 	} else {
 		i915_ppgtt_get(i915_vm_to_ppgtt(vm));
+		list_add_tail(&vma->obj_link, &obj->vma_list);
 	}
 
-	list_add_tail(&vma->obj_link, &obj->vma_list);
-	return vma;
-}
+	rb = NULL;
+	p = &obj->vma_tree.rb_node;
+	while (*p) {
+		struct i915_vma *pos;
 
-static inline bool vma_matches(struct i915_vma *vma,
-			       struct i915_address_space *vm,
-			       const struct i915_ggtt_view *view)
-{
-	if (vma->vm != vm)
-		return false;
-
-	if (!i915_vma_is_ggtt(vma))
-		return true;
-
-	if (!view)
-		return vma->ggtt_view.type == 0;
-
-	if (vma->ggtt_view.type != view->type)
-		return false;
+		rb = *p;
+		pos = rb_entry(rb, struct i915_vma, obj_node);
+		if (vma_compare(pos, vm, view) < 0)
+			p = &rb->rb_right;
+		else
+			p = &rb->rb_left;
+	}
+	rb_link_node(&vma->obj_node, rb, p);
+	rb_insert_color(&vma->obj_node, &obj->vma_tree);
 
-	return memcmp(&vma->ggtt_view.params,
-		      &view->params,
-		      sizeof(view->params)) == 0;
+	return vma;
 }
 
 struct i915_vma *
@@ -3501,11 +3518,22 @@  i915_gem_obj_to_vma(struct drm_i915_gem_object *obj,
 		    struct i915_address_space *vm,
 		    const struct i915_ggtt_view *view)
 {
-	struct i915_vma *vma;
+	struct rb_node *rb;
+
+	rb = obj->vma_tree.rb_node;
+	while (rb) {
+		struct i915_vma *vma;
+		int cmp;
 
-	list_for_each_entry_reverse(vma, &obj->vma_list, obj_link)
-		if (vma_matches(vma, vm, view))
+		vma = rb_entry(rb, struct i915_vma, obj_node);
+		cmp = vma_compare(vma, vm, view);
+		if (cmp == 0)
 			return vma;
+		else if (cmp < 0)
+			rb = rb->rb_right;
+		else
+			rb = rb->rb_left;
+	}
 
 	return NULL;
 }
@@ -3521,8 +3549,10 @@  i915_gem_obj_lookup_or_create_vma(struct drm_i915_gem_object *obj,
 	GEM_BUG_ON(view && !i915_is_ggtt(vm));
 
 	vma = i915_gem_obj_to_vma(obj, vm, view);
-	if (!vma)
+	if (!vma) {
 		vma = __i915_vma_create(obj, vm, view);
+		GEM_BUG_ON(vma != i915_gem_obj_to_vma(obj, vm, view));
+	}
 
 	GEM_BUG_ON(i915_vma_is_closed(vma));
 	return vma;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 518e75b64290..c23ef9db1f53 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -227,6 +227,7 @@  struct i915_vma {
 	struct list_head vm_link;
 
 	struct list_head obj_link; /* Link in the object's VMA list */
+	struct rb_node obj_node;
 
 	/** This vma's place in the batchbuffer or on the eviction list */
 	struct list_head exec_list;