
drm/i915: Serialise the fill BLT with the vma pinning

Message ID 20190820135232.31961-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series drm/i915: Serialise the fill BLT with the vma pinning

Commit Message

Chris Wilson Aug. 20, 2019, 1:52 p.m. UTC
Make sure that we wait for the vma to be pinned prior to telling the GPU
to fill the pages through that vma.

However, since our async operations fight over obj->resv->excl_fence we
must manually order them. This makes it much more fragile, and gives an
outside observer the chance to see the intermediate fences. To be
discussed!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.auld@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_client_blt.c    | 46 ++++++++++++++-----
 1 file changed, 35 insertions(+), 11 deletions(-)

Comments

Matthew Auld Aug. 21, 2019, 11 a.m. UTC | #1
On 20/08/2019 14:52, Chris Wilson wrote:
> Make sure that we wait for the vma to be pinned prior to telling the GPU
> to fill the pages through that vma.
> 
> However, since our async operations fight over obj->resv->excl_fence we
> must manually order them. This makes it much more fragile, and gives an
> outside observer the chance to see the intermediate fences. To be
> discussed!
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Matthew Auld <matthew.auld@intel.com>
> ---
>   .../gpu/drm/i915/gem/i915_gem_client_blt.c    | 46 ++++++++++++++-----
>   1 file changed, 35 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> index 3502071e1391..bbbc10499099 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> @@ -71,10 +71,30 @@ static struct i915_sleeve *create_sleeve(struct i915_address_space *vm,
>   		goto err_free;
>   	}
>   
> +	/*
> +	 * XXX fix scheduling with get_pages & clear workers
> +	 *
> +	 * The complication is that we end up overwriting the same
> +	 * obj->resv->excl_fence for each stage of the operation. That fence
> +	 * should be set on scheduling the work, and only signaled upon
> +	 * completion of the entire workqueue.
> +	 *
> +	 * Within the workqueue, we use the fence to schedule each individual
> +	 * task. Each individual task knows to use obj->resv->fence.
> +	 *
> +	 * To an outsider, they must wait until the end and so the
> +	 * obj->resv->fence must be the composite.
> +	 *
> +	 * Ideas?
> +	 */

I don't have any good ideas...

Are we meant to somehow have a "shadow" resv obj that we use for our 
intermediate pipelined ops, such that we don't leak anything?
		
Reviewed-by: Matthew Auld <matthew.auld@intel.com>

> +	err = i915_vma_pin(vma, 0, 0, PIN_USER);
> +	if (unlikely(err))
> +		goto err_free;
> +
>   	vma->private = sleeve;
>   	vma->ops = &proxy_vma_ops;
>   
> -	sleeve->vma = vma;
> +	sleeve->vma = i915_vma_get(vma);
>   	sleeve->pages = pages;
>   	sleeve->page_sizes = *page_sizes;
>   
> @@ -87,6 +107,13 @@ static struct i915_sleeve *create_sleeve(struct i915_address_space *vm,
>   
>   static void destroy_sleeve(struct i915_sleeve *sleeve)
>   {
> +	struct i915_vma *vma = sleeve->vma;
> +
> +	if (vma) {
> +		i915_vma_unpin(vma);
> +		i915_vma_put(vma);
> +	}
> +
>   	kfree(sleeve);
>   }
>   
> @@ -155,8 +182,8 @@ static void clear_pages_dma_fence_cb(struct dma_fence *fence,
>   static void clear_pages_worker(struct work_struct *work)
>   {
>   	struct clear_pages_work *w = container_of(work, typeof(*w), work);
> -	struct drm_i915_gem_object *obj = w->sleeve->vma->obj;
> -	struct i915_vma *vma = w->sleeve->vma;
> +	struct i915_vma *vma = fetch_and_zero(&w->sleeve->vma);
> +	struct drm_i915_gem_object *obj = vma->obj;
>   	struct i915_request *rq;
>   	struct i915_vma *batch;
>   	int err = w->dma.error;
> @@ -166,20 +193,16 @@ static void clear_pages_worker(struct work_struct *work)
>   
>   	if (obj->cache_dirty) {
>   		if (i915_gem_object_has_struct_page(obj))
> -			drm_clflush_sg(w->sleeve->pages);
> +			drm_clflush_sg(vma->pages);
>   		obj->cache_dirty = false;
>   	}
>   	obj->read_domains = I915_GEM_GPU_DOMAINS;
>   	obj->write_domain = 0;
>   
> -	err = i915_vma_pin(vma, 0, 0, PIN_USER);
> -	if (unlikely(err))
> -		goto out_signal;
> -
>   	batch = intel_emit_vma_fill_blt(w->ce, vma, w->value);
>   	if (IS_ERR(batch)) {
>   		err = PTR_ERR(batch);
> -		goto out_unpin;
> +		goto out_signal;
>   	}
>   
>   	rq = intel_context_create_request(w->ce);
> @@ -224,14 +247,15 @@ static void clear_pages_worker(struct work_struct *work)
>   	i915_request_add(rq);
>   out_batch:
>   	intel_emit_vma_release(w->ce, batch);
> -out_unpin:
> -	i915_vma_unpin(vma);
>   out_signal:
>   	if (unlikely(err)) {
>   		dma_fence_set_error(&w->dma, err);
>   		dma_fence_signal(&w->dma);
>   		dma_fence_put(&w->dma);
>   	}
> +
> +	i915_vma_unpin(vma);
> +	i915_vma_put(vma);
>   }
>   
>   static int __i915_sw_fence_call
>
Chris Wilson Aug. 21, 2019, 11:13 a.m. UTC | #2
Quoting Matthew Auld (2019-08-21 12:00:07)
> On 20/08/2019 14:52, Chris Wilson wrote:
> > Make sure that we wait for the vma to be pinned prior to telling the GPU
> > to fill the pages through that vma.
> > 
> > However, since our async operations fight over obj->resv->excl_fence we
> > must manually order them. This makes it much more fragile, and gives an
> > outside observer the chance to see the intermediate fences. To be
> > discussed!
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Matthew Auld <matthew.auld@intel.com>
> > ---
> >   .../gpu/drm/i915/gem/i915_gem_client_blt.c    | 46 ++++++++++++++-----
> >   1 file changed, 35 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> > index 3502071e1391..bbbc10499099 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
> > @@ -71,10 +71,30 @@ static struct i915_sleeve *create_sleeve(struct i915_address_space *vm,
> >               goto err_free;
> >       }
> >   
> > +     /*
> > +      * XXX fix scheduling with get_pages & clear workers
> > +      *
> > +      * The complication is that we end up overwriting the same
> > +      * obj->resv->excl_fence for each stage of the operation. That fence
> > +      * should be set on scheduling the work, and only signaled upon
> > +      * completion of the entire workqueue.
> > +      *
> > +      * Within the workqueue, we use the fence to schedule each individual
> > +      * task. Each individual task knows to use obj->resv->fence.
> > +      *
> > +      * To an outsider, they must wait until the end and so the
> > +      * obj->resv->fence must be the composite.
> > +      *
> > +      * Ideas?
> > +      */
> 
> I don't have any good ideas...
> 
> Are we meant to somehow have a "shadow" resv obj that we use for our 
> intermediate pipelined ops, such that we don't leak anything?

A sketch I haven't put any code to yet is a struct chain_fence {
	struct dma_fence base;
	struct dma_fence *prev;
	struct dma_fence_cb cb;
	/* if only dma_chain_fence wasn't already taken, maybe dma_composite_fence */
}; that we insert with dma_resv_add_excl_fence() and then
pass to all the async operations, which then chain onto it. It means more
work in making sure we have the fence available at all times, all the way
down the chain, but it should work (now and in the future)... give or
take various factors to make sure we don't wait on the same fence while
building it (eviction and shrinking).
-Chris
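
A minimal, untested sketch of the composite-fence idea described above, built
only on the stock dma_fence API. All of the names here (composite_fence,
composite_fence_create, composite_final_stage_cb, last_stage_fence) are
hypothetical and not part of this patch; release and enable_signaling are left
to the dma_fence defaults:

#include <linux/dma-fence.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* One fence installed as the exclusive fence for the whole pipeline. */
struct composite_fence {
	struct dma_fence base;
	spinlock_t lock;
	struct dma_fence_cb cb;
};

static const char *composite_get_driver_name(struct dma_fence *fence)
{
	return "i915";
}

static const char *composite_get_timeline_name(struct dma_fence *fence)
{
	return "composite";
}

static const struct dma_fence_ops composite_fence_ops = {
	.get_driver_name = composite_get_driver_name,
	.get_timeline_name = composite_get_timeline_name,
};

/* Signal the externally visible fence once the final stage has completed. */
static void composite_final_stage_cb(struct dma_fence *stage,
				     struct dma_fence_cb *cb)
{
	struct composite_fence *cf = container_of(cb, typeof(*cf), cb);

	dma_fence_signal(&cf->base);
	dma_fence_put(&cf->base); /* drop the reference held for the callback */
}

static struct composite_fence *composite_fence_create(void)
{
	struct composite_fence *cf;

	cf = kzalloc(sizeof(*cf), GFP_KERNEL);
	if (!cf)
		return NULL;

	spin_lock_init(&cf->lock);
	dma_fence_init(&cf->base, &composite_fence_ops, &cf->lock,
		       dma_fence_context_alloc(1), 0);
	return cf;
}

/*
 * Usage sketch: install &cf->base as the exclusive fence before scheduling
 * any of the async stages, let the stages order themselves on their own
 * internal fences, and hook only the last stage's fence to the composite:
 *
 *	dma_fence_get(&cf->base);
 *	if (dma_fence_add_callback(last_stage_fence, &cf->cb,
 *				   composite_final_stage_cb))
 *		composite_final_stage_cb(last_stage_fence, &cf->cb);
 *
 * so an outside observer only ever waits on the composite, never on the
 * intermediate per-stage fences.
 */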

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
index 3502071e1391..bbbc10499099 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c
@@ -71,10 +71,30 @@  static struct i915_sleeve *create_sleeve(struct i915_address_space *vm,
 		goto err_free;
 	}
 
+	/*
+	 * XXX fix scheduling with get_pages & clear workers
+	 *
+	 * The complication is that we end up overwriting the same
+	 * obj->resv->excl_fence for each stage of the operation. That fence
+	 * should be set on scheduling the work, and only signaled upon
+	 * completion of the entire workqueue.
+	 *
+	 * Within the workqueue, we use the fence to schedule each individual
+	 * task. Each individual task knows to use obj->resv->fence.
+	 *
+	 * To an outsider, they must wait until the end and so the
+	 * obj->resv->fence must be the composite.
+	 *
+	 * Ideas?
+	 */
+	err = i915_vma_pin(vma, 0, 0, PIN_USER);
+	if (unlikely(err))
+		goto err_free;
+
 	vma->private = sleeve;
 	vma->ops = &proxy_vma_ops;
 
-	sleeve->vma = vma;
+	sleeve->vma = i915_vma_get(vma);
 	sleeve->pages = pages;
 	sleeve->page_sizes = *page_sizes;
 
@@ -87,6 +107,13 @@  static struct i915_sleeve *create_sleeve(struct i915_address_space *vm,
 
 static void destroy_sleeve(struct i915_sleeve *sleeve)
 {
+	struct i915_vma *vma = sleeve->vma;
+
+	if (vma) {
+		i915_vma_unpin(vma);
+		i915_vma_put(vma);
+	}
+
 	kfree(sleeve);
 }
 
@@ -155,8 +182,8 @@  static void clear_pages_dma_fence_cb(struct dma_fence *fence,
 static void clear_pages_worker(struct work_struct *work)
 {
 	struct clear_pages_work *w = container_of(work, typeof(*w), work);
-	struct drm_i915_gem_object *obj = w->sleeve->vma->obj;
-	struct i915_vma *vma = w->sleeve->vma;
+	struct i915_vma *vma = fetch_and_zero(&w->sleeve->vma);
+	struct drm_i915_gem_object *obj = vma->obj;
 	struct i915_request *rq;
 	struct i915_vma *batch;
 	int err = w->dma.error;
@@ -166,20 +193,16 @@  static void clear_pages_worker(struct work_struct *work)
 
 	if (obj->cache_dirty) {
 		if (i915_gem_object_has_struct_page(obj))
-			drm_clflush_sg(w->sleeve->pages);
+			drm_clflush_sg(vma->pages);
 		obj->cache_dirty = false;
 	}
 	obj->read_domains = I915_GEM_GPU_DOMAINS;
 	obj->write_domain = 0;
 
-	err = i915_vma_pin(vma, 0, 0, PIN_USER);
-	if (unlikely(err))
-		goto out_signal;
-
 	batch = intel_emit_vma_fill_blt(w->ce, vma, w->value);
 	if (IS_ERR(batch)) {
 		err = PTR_ERR(batch);
-		goto out_unpin;
+		goto out_signal;
 	}
 
 	rq = intel_context_create_request(w->ce);
@@ -224,14 +247,15 @@  static void clear_pages_worker(struct work_struct *work)
 	i915_request_add(rq);
 out_batch:
 	intel_emit_vma_release(w->ce, batch);
-out_unpin:
-	i915_vma_unpin(vma);
 out_signal:
 	if (unlikely(err)) {
 		dma_fence_set_error(&w->dma, err);
 		dma_fence_signal(&w->dma);
 		dma_fence_put(&w->dma);
 	}
+
+	i915_vma_unpin(vma);
+	i915_vma_put(vma);
 }
 
 static int __i915_sw_fence_call