
[3/4] drm/i915: Emit pipelined fence changes

Message ID 20170703101412.30820-3-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson July 3, 2017, 10:14 a.m. UTC
Many years ago, long before requests, we tried doing this. We never
quite got it right, but now with requests we have the tracking to do the
job properly!

One of the stall points for gen2/gen3 is the use of fence registers for
GPU operations. There are only a few available, and currently if we
exhaust the available fence registers we must stall the GPU between
batches. By pipelining the fence register writes, we can avoid the stall
and continuously emit the batches. The challenge is remembering to wait
for those pipelined LRI before accessing the fence with the CPU, and
that is what our request tracking makes easy.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c            |   1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  15 ++-
 drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 193 +++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_gem_fence_reg.h  |   1 +
 drivers/gpu/drm/i915/i915_vma.c            |   1 +
 drivers/gpu/drm/i915/i915_vma.h            |   4 +
 6 files changed, 186 insertions(+), 29 deletions(-)
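
For readers skimming the discussion below, a condensed sketch of the
idea (illustration only, not part of the patch; it mirrors the gen3
i915_write_fence_reg() paths added below, with fence, val and pipelined
as computed in that function):

	/* Unpipelined path: idle the GPU, then poke the register via mmio */
	I915_WRITE(FENCE_REG(fence->id), val);
	POSTING_READ(FENCE_REG(fence->id));

	/* Pipelined path: emit the same update as an LRI into the request,
	 * so the GPU reprograms the fence in order with the batches and no
	 * CPU stall is needed in between.
	 */
	cs = intel_ring_begin(pipelined, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
	*cs++ = val;
	*cs++ = MI_NOOP;
	intel_ring_advance(pipelined, cs);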

Comments

Tvrtko Ursulin July 5, 2017, 10:19 a.m. UTC | #1
On 03/07/2017 11:14, Chris Wilson wrote:
> Many years ago, long before requests, we tried doing this. We never
> quite got it right, but now with requests we have the tracking to do the
> job properly!

Add a few words on the benefits in certain use cases/benchmarks.

> 
> One of the stall points for gen2/gen3 is the use of fence registers for
> GPU operations. There are only a few available, and currently if we
> exhaust the available fence registers we must stall the GPU between
> batches. By pipelining the fence register writes, we can avoid the stall
> and continuously emit the batches. The challenge is remembering to wait
> for those pipelined LRI before accessing the fence with the CPU, and
> that is what our request tracking makes easy.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem.c            |   1 +
>   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  15 ++-
>   drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 193 +++++++++++++++++++++++++----
>   drivers/gpu/drm/i915/i915_gem_fence_reg.h  |   1 +
>   drivers/gpu/drm/i915/i915_vma.c            |   1 +
>   drivers/gpu/drm/i915/i915_vma.h            |   4 +
>   6 files changed, 186 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 939a299260e9..5318e321ce50 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4880,6 +4880,7 @@ i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
>   		fence->i915 = dev_priv;
>   		fence->id = i;
>   		list_add_tail(&fence->link, &dev_priv->mm.fence_list);
> +		init_request_active(&fence->pipelined, NULL);
>   	}
>   	i915_gem_restore_fences(dev_priv);
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 22a9f5358322..19347ad0e7ac 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -365,11 +365,12 @@ eb_pin_vma(struct i915_execbuffer *eb,
>   		return;
>   
>   	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> -		if (unlikely(i915_vma_pin_fence(vma))) {
> +		if (unlikely(i915_vma_reserve_fence(vma))) {
>   			i915_vma_unpin(vma);
>   			return;
>   		}
>   
> +		entry->flags &= ~EXEC_OBJECT_ASYNC;
>   		if (vma->fence)
>   			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
>   	}
> @@ -564,12 +565,13 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
>   	GEM_BUG_ON(eb_vma_misplaced(entry, vma));
>   
>   	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> -		err = i915_vma_pin_fence(vma);
> +		err = i915_vma_reserve_fence(vma);
>   		if (unlikely(err)) {
>   			i915_vma_unpin(vma);
>   			return err;
>   		}
>   
> +		entry->flags &= ~EXEC_OBJECT_ASYNC;

A comment here would be good.

>   		if (vma->fence)
>   			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
>   	}
> @@ -1848,6 +1850,12 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
>   		if (unlikely(obj->cache_dirty && !obj->cache_coherent))
>   			i915_gem_clflush_object(obj, 0);
>   
> +		if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> +			err = i915_vma_emit_pipelined_fence(vma, eb->request);
> +			if (err)
> +				return err;
> +		}
> +
>   		err = i915_gem_request_await_object
>   			(eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
>   		if (err)
> @@ -1932,9 +1940,6 @@ void i915_vma_move_to_active(struct i915_vma *vma,
>   		obj->base.read_domains = 0;
>   	}
>   	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
> -
> -	if (flags & EXEC_OBJECT_NEEDS_FENCE)
> -		i915_gem_active_set(&vma->last_fence, req);
>   }
>   
>   static int i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> index 55ac7bc14fce..d0cd051c19fd 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> @@ -55,10 +55,9 @@
>    * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
>    */
>   
> -#define pipelined 0
> -
> -static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> -				 struct i915_vma *vma)
> +static int i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> +				struct i915_vma *vma,
> +				struct drm_i915_gem_request *pipelined)
>   {
>   	i915_reg_t fence_reg_lo, fence_reg_hi;
>   	int fence_pitch_shift;
> @@ -110,11 +109,30 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
>   		I915_WRITE(fence_reg_hi, upper_32_bits(val));
>   		I915_WRITE(fence_reg_lo, lower_32_bits(val));
>   		POSTING_READ(fence_reg_lo);
> +	} else {
> +		u32 *cs;
> +
> +		cs = intel_ring_begin(pipelined, 8);

Do we need to adjust the ring space reservation amount?

> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		*cs++ = MI_LOAD_REGISTER_IMM(3);
> +		*cs++ = i915_mmio_reg_offset(fence_reg_lo);
> +		*cs++ = 0;
> +		*cs++ = i915_mmio_reg_offset(fence_reg_hi);
> +		*cs++ = upper_32_bits(val);
> +		*cs++ = i915_mmio_reg_offset(fence_reg_lo);
> +		*cs++ = lower_32_bits(val);
> +		*cs++ = MI_NOOP;
> +		intel_ring_advance(pipelined, cs);
>   	}
> +
> +	return 0;
>   }
>   
> -static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
> -				 struct i915_vma *vma)
> +static int i915_write_fence_reg(struct drm_i915_fence_reg *fence,
> +				struct i915_vma *vma,
> +				struct drm_i915_gem_request *pipelined)
>   {
>   	u32 val;
>   
> @@ -150,11 +168,26 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
>   
>   		I915_WRITE(reg, val);
>   		POSTING_READ(reg);
> +	} else {
> +		u32 *cs;
> +
> +		cs = intel_ring_begin(pipelined, 4);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		*cs++ = MI_LOAD_REGISTER_IMM(1);
> +		*cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
> +		*cs++ = val;
> +		*cs++ = MI_NOOP;
> +		intel_ring_advance(pipelined, cs);
>   	}
> +
> +	return 0;
>   }
>   
> -static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
> -				 struct i915_vma *vma)
> +static int i830_write_fence_reg(struct drm_i915_fence_reg *fence,
> +				struct i915_vma *vma,
> +				struct drm_i915_gem_request *pipelined)
>   {
>   	u32 val;
>   
> @@ -182,29 +215,49 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
>   
>   		I915_WRITE(reg, val);
>   		POSTING_READ(reg);
> +	} else {
> +		u32 *cs;
> +
> +		cs = intel_ring_begin(pipelined, 4);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		*cs++ = MI_LOAD_REGISTER_IMM(1);
> +		*cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
> +		*cs++ = val;
> +		*cs++ = MI_NOOP;
> +		intel_ring_advance(pipelined, cs);
>   	}
> +
> +	return 0;
>   }
>   
> -static void fence_write(struct drm_i915_fence_reg *fence,
> -			struct i915_vma *vma)
> +static int fence_write(struct drm_i915_fence_reg *fence,
> +		       struct i915_vma *vma,
> +		       struct drm_i915_gem_request *rq)
>   {
> +	int err;
> +
>   	/* Previous access through the fence register is marshalled by
>   	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
>   	 * and explicitly managed for internal users.
>   	 */
>   
>   	if (IS_GEN2(fence->i915))
> -		i830_write_fence_reg(fence, vma);
> +		err = i830_write_fence_reg(fence, vma, rq);
>   	else if (IS_GEN3(fence->i915))
> -		i915_write_fence_reg(fence, vma);
> +		err = i915_write_fence_reg(fence, vma, rq);
>   	else
> -		i965_write_fence_reg(fence, vma);
> +		err = i965_write_fence_reg(fence, vma, rq);
> +	if (err)
> +		return err;
>   
>   	/* Access through the fenced region afterwards is
>   	 * ordered by the posting reads whilst writing the registers.
>   	 */
>   
>   	fence->dirty = false;
> +	return 0;
>   }
>   
>   static int fence_update(struct drm_i915_fence_reg *fence,
> @@ -212,17 +265,15 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>   {
>   	int ret;
>   
> +	ret = i915_gem_active_retire(&fence->pipelined,
>   +				     &fence->i915->drm.struct_mutex);
>   +	if (ret)
> +		return ret;
> +
>   	if (vma) {
>   		if (!i915_vma_is_map_and_fenceable(vma))
>   			return -EINVAL;
>   
> -		if (WARN(!i915_gem_object_get_stride(vma->obj) ||
> -			 !i915_gem_object_get_tiling(vma->obj),
> -			 "bogus fence setup with stride: 0x%x, tiling mode: %i\n",
> -			 i915_gem_object_get_stride(vma->obj),
> -			 i915_gem_object_get_tiling(vma->obj)))
> -			return -EINVAL;
> -
>   		ret = i915_gem_active_retire(&vma->last_fence,
>   					     &vma->obj->base.dev->struct_mutex);
>   		if (ret)
> @@ -253,7 +304,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>   	 * to the runtime resume, see i915_gem_restore_fences().
>   	 */
>   	if (intel_runtime_pm_get_if_in_use(fence->i915)) {
> -		fence_write(fence, vma);
> +		fence_write(fence, vma, NULL);
>   		intel_runtime_pm_put(fence->i915);
>   	}
>   
> @@ -287,6 +338,8 @@ int i915_vma_put_fence(struct i915_vma *vma)
>   	if (!fence)
>   		return 0;
>   
> +	GEM_BUG_ON(fence->vma != vma);
> +
>   	if (fence->pin_count)
>   		return -EBUSY;
>   
> @@ -344,11 +397,16 @@ i915_vma_pin_fence(struct i915_vma *vma)
>   	assert_rpm_wakelock_held(vma->vm->i915);
>   
>   	/* Just update our place in the LRU if our fence is getting reused. */
> -	if (vma->fence) {
> -		fence = vma->fence;
> +	fence = vma->fence;
> +	if (fence) {
>   		GEM_BUG_ON(fence->vma != vma);
>   		fence->pin_count++;
>   		if (!fence->dirty) {
> +			err = i915_gem_active_retire(&fence->pipelined,
> +						     &fence->i915->drm.struct_mutex);
> +			if (err)
> +				goto err_unpin;

A comment explaining the need for this block would be good. There is 
already one CPU wait on the fence_update path, and I couldn't easily 
figure out why another is needed here. And why in the !dirty case 
specifically?

> +
>   			list_move_tail(&fence->link,
>   				       &fence->i915->mm.fence_list);
>   			return 0;
> @@ -372,6 +430,93 @@ i915_vma_pin_fence(struct i915_vma *vma)
>   	return err;
>   }
>   
> +int i915_vma_reserve_fence(struct i915_vma *vma)
> +{
> +	struct drm_i915_fence_reg *fence;
> +
> +	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
> +	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
> +	GEM_BUG_ON(!i915_vma_is_pinned(vma));
> +
> +	fence = vma->fence;
> +	if (!fence) {
> +		if (!i915_gem_object_is_tiled(vma->obj))
> +			return 0;
> +
> +		if (!i915_vma_is_map_and_fenceable(vma))
> +			return -EINVAL;
> +
> +		fence = fence_find(vma->vm->i915);
> +		if (IS_ERR(fence))
> +			return PTR_ERR(fence);

If all fences are pinned via i915_gem_fault, which I thought can happen 
with the previous patch, then here we error out instead of waiting?

> +
> +		vma->fence = fence;
> +
> +		if (fence->vma) {
> +			i915_gem_release_mmap(fence->vma->obj);
> +			fence->vma->fence = NULL;
> +		}
> +		fence->vma = vma;
> +		fence->dirty = true;
> +	}
> +	fence->pin_count++;
> +	list_move_tail(&fence->link, &fence->i915->mm.fence_list);
> +
> +	GEM_BUG_ON(!i915_gem_object_is_tiled(vma->obj));
> +	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
> +	GEM_BUG_ON(vma->node.size != vma->fence_size);
> +	GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_alignment));
> +
> +	return 0;
> +}
> +
> +int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
> +				  struct drm_i915_gem_request *rq)
> +{
> +	struct drm_i915_fence_reg *fence = vma->fence;
> +	struct drm_i915_gem_request *prev;
> +	int err;
> +
> +	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
> +	GEM_BUG_ON(fence && !fence->pin_count);
> +
> +	if (!fence)
> +		goto out;
> +
> +	prev = i915_gem_active_raw(&fence->pipelined,
> +				   &fence->i915->drm.struct_mutex);
> +	if (prev) {
> +		err = i915_gem_request_await_dma_fence(rq, &prev->fence);
> +		if (err)
> +			return err;
> +	}
> +
> +	if (!fence->dirty)
> +		goto out;

Hm, so unless "dirty" is set, no actual fence management will happen. 
What is the meaning of dirty? Has it changed? Does it need a write-up in 
the header file?

Looks like in any case I will need to apply this to follow the flow.

Regards,

Tvrtko

> +
> +	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
> +
> +	if (fence->vma) {
> +		prev = i915_gem_active_raw(&fence->vma->last_fence,
> +					   &fence->i915->drm.struct_mutex);
> +		if (prev) {
> +			err = i915_gem_request_await_dma_fence(rq,
> +							       &prev->fence);
> +			if (err)
> +				return err;
> +		}
> +	}
> +
> +	err = fence_write(fence, vma, rq);
> +	if (err)
> +		return err;
> +
> +	i915_gem_active_set(&fence->pipelined, rq);
> +out:
> +	i915_gem_active_set(&vma->last_fence, rq);
> +	return 0;
> +}
> +
>   /**
>    * i915_gem_revoke_fences - revoke fence state
>    * @dev_priv: i915 device private
> @@ -429,7 +574,7 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
>   			vma = NULL;
>   		}
>   
> -		fence_write(reg, vma);
> +		fence_write(reg, vma, NULL);
>   		reg->vma = vma;
>   	}
>   }
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> index 99a31ded4dfd..ce45972fc5c6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> @@ -47,6 +47,7 @@ struct drm_i915_fence_reg {
>   	 * command (such as BLT on gen2/3), as a "fence".
>   	 */
>   	bool dirty;
> +	struct i915_gem_active pipelined;
>   };
>   
>   #endif
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index efbfee8eac99..0c489090d4ab 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -728,6 +728,7 @@ int i915_vma_unbind(struct i915_vma *vma)
>   		__i915_vma_iounmap(vma);
>   		vma->flags &= ~I915_VMA_CAN_FENCE;
>   	}
> +	GEM_BUG_ON(vma->fence);
>   
>   	if (likely(!vma->vm->closed)) {
>   		trace_i915_vma_unbind(vma);
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 19f58af4f1bf..f0dc6eaebeab 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -367,5 +367,9 @@ i915_vma_unpin_fence(struct i915_vma *vma)
>   		__i915_vma_unpin_fence(vma);
>   }
>   
> +int __must_check i915_vma_reserve_fence(struct i915_vma *vma);
> +int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
> +				  struct drm_i915_gem_request *rq);
> +
>   #endif
>   
>
Chris Wilson July 6, 2017, 11:35 a.m. UTC | #2
Quoting Tvrtko Ursulin (2017-07-05 11:19:43)
> 
> On 03/07/2017 11:14, Chris Wilson wrote:
> > Many years ago, long before requests, we tried doing this. We never
> > quite got it right, but now with requests we have the tracking to do the
> > job properly!
> 
> Add a few words on the benefits in certain use cases/benchmarks.

A few percent in padman on Pineview. One presumes that it would have a
bigger impact on minimum framerates than on the average, but I didn't
check.

> > One of the stall points for gen2/gen3 is the use of fence registers for
> > GPU operations. There are only a few available, and currently if we
> > exhaust the available fence registers we must stall the GPU between
> > batches. By pipelining the fence register writes, we can avoid the stall
> > and continuously emit the batches. The challenge is remembering to wait
> > for those pipelined LRI before accessing the fence with the CPU, and
> > that is what our request tracking makes easy.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/i915_gem.c            |   1 +
> >   drivers/gpu/drm/i915/i915_gem_execbuffer.c |  15 ++-
> >   drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 193 +++++++++++++++++++++++++----
> >   drivers/gpu/drm/i915/i915_gem_fence_reg.h  |   1 +
> >   drivers/gpu/drm/i915/i915_vma.c            |   1 +
> >   drivers/gpu/drm/i915/i915_vma.h            |   4 +
> >   6 files changed, 186 insertions(+), 29 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 939a299260e9..5318e321ce50 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -4880,6 +4880,7 @@ i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
> >               fence->i915 = dev_priv;
> >               fence->id = i;
> >               list_add_tail(&fence->link, &dev_priv->mm.fence_list);
> > +             init_request_active(&fence->pipelined, NULL);
> >       }
> >       i915_gem_restore_fences(dev_priv);
> >   
> > diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > index 22a9f5358322..19347ad0e7ac 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> > @@ -365,11 +365,12 @@ eb_pin_vma(struct i915_execbuffer *eb,
> >               return;
> >   
> >       if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> > -             if (unlikely(i915_vma_pin_fence(vma))) {
> > +             if (unlikely(i915_vma_reserve_fence(vma))) {
> >                       i915_vma_unpin(vma);
> >                       return;
> >               }
> >   
> > +             entry->flags &= ~EXEC_OBJECT_ASYNC;
> >               if (vma->fence)
> >                       entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> >       }
> > @@ -564,12 +565,13 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
> >       GEM_BUG_ON(eb_vma_misplaced(entry, vma));
> >   
> >       if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> > -             err = i915_vma_pin_fence(vma);
> > +             err = i915_vma_reserve_fence(vma);
> >               if (unlikely(err)) {
> >                       i915_vma_unpin(vma);
> >                       return err;
> >               }
> >   
> > +             entry->flags &= ~EXEC_OBJECT_ASYNC;
> 
> A comment here would be good.

We rely on the serialisation between LRIs, which the user has no
knowledge of, and so we have to ignore any request to skip the required
LRI.
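
For the record, the comment could read something like this (a sketch;
the wording is mine and not from any posted revision, the code line is
the one quoted above):

	/*
	 * The fence update for this vma may be emitted as an LRI in this
	 * request. Userspace has no knowledge of (and no control over)
	 * that implicit serialisation, so ignore any EXEC_OBJECT_ASYNC
	 * hint rather than let it skip the wait on the pipelined fence
	 * write.
	 */
	entry->flags &= ~EXEC_OBJECT_ASYNC;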

> 
> >               if (vma->fence)
> >                       entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> >       }
> > @@ -1848,6 +1850,12 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
> >               if (unlikely(obj->cache_dirty && !obj->cache_coherent))
> >                       i915_gem_clflush_object(obj, 0);
> >   
> > +             if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
> > +                     err = i915_vma_emit_pipelined_fence(vma, eb->request);
> > +                     if (err)
> > +                             return err;
> > +             }
> > +
> >               err = i915_gem_request_await_object
> >                       (eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
> >               if (err)
> > @@ -1932,9 +1940,6 @@ void i915_vma_move_to_active(struct i915_vma *vma,
> >               obj->base.read_domains = 0;
> >       }
> >       obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
> > -
> > -     if (flags & EXEC_OBJECT_NEEDS_FENCE)
> > -             i915_gem_active_set(&vma->last_fence, req);
> >   }
> >   
> >   static int i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
> > diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> > index 55ac7bc14fce..d0cd051c19fd 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> > @@ -55,10 +55,9 @@
> >    * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
> >    */
> >   
> > -#define pipelined 0
> > -
> > -static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> > -                              struct i915_vma *vma)
> > +static int i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> > +                             struct i915_vma *vma,
> > +                             struct drm_i915_gem_request *pipelined)
> >   {
> >       i915_reg_t fence_reg_lo, fence_reg_hi;
> >       int fence_pitch_shift;
> > @@ -110,11 +109,30 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
> >               I915_WRITE(fence_reg_hi, upper_32_bits(val));
> >               I915_WRITE(fence_reg_lo, lower_32_bits(val));
> >               POSTING_READ(fence_reg_lo);
> > +     } else {
> > +             u32 *cs;
> > +
> > +             cs = intel_ring_begin(pipelined, 8);
> 
> Do we need to adjust the ring space reservation amount?

No. We don't reset the registers at the end of a request, but either
emit a new request or serialise with mmio from the CPU.

> > @@ -344,11 +397,16 @@ i915_vma_pin_fence(struct i915_vma *vma)
> >       assert_rpm_wakelock_held(vma->vm->i915);
> >   
> >       /* Just update our place in the LRU if our fence is getting reused. */
> > -     if (vma->fence) {
> > -             fence = vma->fence;
> > +     fence = vma->fence;
> > +     if (fence) {
> >               GEM_BUG_ON(fence->vma != vma);
> >               fence->pin_count++;
> >               if (!fence->dirty) {
> > +                     err = i915_gem_active_retire(&fence->pipelined,
> > +                                                  &fence->i915->drm.struct_mutex);
> > +                     if (err)
> > +                             goto err_unpin;
> 
> A comment explaining the need for this block would be good. There is 
> already one CPU wait on the fence_update path, and I couldn't easily 
> figure out why another is needed here. And why in the !dirty case 
> specifically?

!dirty is a shortcut, but here we only need to wait for the LRI (to set
the fence register) itself and not for the subsequent usage of the fence.

It is strictly an earlier point than the fence->vma->last_fence request.
(Hmm, strictly is a bit of an exaggeration, it does assume single
timeline gen3.)
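
Spelled out as a comment (again only a sketch, not from the posted
patch; the calls are the ones in the quoted hunk):

	/*
	 * Even though the register contents are unchanged (!dirty), the
	 * write that last set them may have been a pipelined LRI that is
	 * still in flight. Wait for that write alone, not for any later
	 * GPU use of the fence, before handing it over to a CPU user.
	 */
	err = i915_gem_active_retire(&fence->pipelined,
				     &fence->i915->drm.struct_mutex);
	if (err)
		goto err_unpin;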

> > +int i915_vma_reserve_fence(struct i915_vma *vma)
> > +{
> > +     struct drm_i915_fence_reg *fence;
> > +
> > +     lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
> > +     GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
> > +     GEM_BUG_ON(!i915_vma_is_pinned(vma));
> > +
> > +     fence = vma->fence;
> > +     if (!fence) {
> > +             if (!i915_gem_object_is_tiled(vma->obj))
> > +                     return 0;
> > +
> > +             if (!i915_vma_is_map_and_fenceable(vma))
> > +                     return -EINVAL;
> > +
> > +             fence = fence_find(vma->vm->i915);
> > +             if (IS_ERR(fence))
> > +                     return PTR_ERR(fence);
> 
> If all fences are pinned via i915_gem_fault, which I thought can happen 
> with the previous patch, then here we error out instead of waiting?

i915_gem_fault doesn't pin a page except for the duration it is
inserting the CPU PTE. It is expected that we do hit an error here
(EDEADLK) on exhaustion, i.e. a single batch using more fences than are
available.
Currently, atomic kms has a huge regression in that it doesn't
report the fences that it is only temporarily holding.

> > +
> > +     if (!fence->dirty)
> > +             goto out;
> 
> Hm, so unless "dirty" is set, no actual fence management will happen. 
> What is the meaning of dirty? Has it changed? Does it need a write-up 
> in the header file?

It means the register no longer matches expectation and we need to
update it before use.
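
If a write-up in the header is wanted, something along these lines
could do (the wording is mine, only a sketch):

	/*
	 * Whether the register contents still match the setup expected
	 * for the vma it tracks (offset, size, stride, tiling). When
	 * dirty, the register must be rewritten, either by mmio from the
	 * CPU or by a pipelined LRI, before the fenced region is used.
	 */
	bool dirty;
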
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 939a299260e9..5318e321ce50 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4880,6 +4880,7 @@  i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
 		fence->i915 = dev_priv;
 		fence->id = i;
 		list_add_tail(&fence->link, &dev_priv->mm.fence_list);
+		init_request_active(&fence->pipelined, NULL);
 	}
 	i915_gem_restore_fences(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 22a9f5358322..19347ad0e7ac 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -365,11 +365,12 @@  eb_pin_vma(struct i915_execbuffer *eb,
 		return;
 
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
-		if (unlikely(i915_vma_pin_fence(vma))) {
+		if (unlikely(i915_vma_reserve_fence(vma))) {
 			i915_vma_unpin(vma);
 			return;
 		}
 
+		entry->flags &= ~EXEC_OBJECT_ASYNC;
 		if (vma->fence)
 			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
@@ -564,12 +565,13 @@  static int eb_reserve_vma(const struct i915_execbuffer *eb,
 	GEM_BUG_ON(eb_vma_misplaced(entry, vma));
 
 	if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
-		err = i915_vma_pin_fence(vma);
+		err = i915_vma_reserve_fence(vma);
 		if (unlikely(err)) {
 			i915_vma_unpin(vma);
 			return err;
 		}
 
+		entry->flags &= ~EXEC_OBJECT_ASYNC;
 		if (vma->fence)
 			entry->flags |= __EXEC_OBJECT_HAS_FENCE;
 	}
@@ -1848,6 +1850,12 @@  static int eb_move_to_gpu(struct i915_execbuffer *eb)
 		if (unlikely(obj->cache_dirty && !obj->cache_coherent))
 			i915_gem_clflush_object(obj, 0);
 
+		if (unlikely(entry->flags & EXEC_OBJECT_NEEDS_FENCE)) {
+			err = i915_vma_emit_pipelined_fence(vma, eb->request);
+			if (err)
+				return err;
+		}
+
 		err = i915_gem_request_await_object
 			(eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
 		if (err)
@@ -1932,9 +1940,6 @@  void i915_vma_move_to_active(struct i915_vma *vma,
 		obj->base.read_domains = 0;
 	}
 	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
-
-	if (flags & EXEC_OBJECT_NEEDS_FENCE)
-		i915_gem_active_set(&vma->last_fence, req);
 }
 
 static int i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 55ac7bc14fce..d0cd051c19fd 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -55,10 +55,9 @@ 
  * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
  */
 
-#define pipelined 0
-
-static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
-				 struct i915_vma *vma)
+static int i965_write_fence_reg(struct drm_i915_fence_reg *fence,
+				struct i915_vma *vma,
+				struct drm_i915_gem_request *pipelined)
 {
 	i915_reg_t fence_reg_lo, fence_reg_hi;
 	int fence_pitch_shift;
@@ -110,11 +109,30 @@  static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
 		I915_WRITE(fence_reg_hi, upper_32_bits(val));
 		I915_WRITE(fence_reg_lo, lower_32_bits(val));
 		POSTING_READ(fence_reg_lo);
+	} else {
+		u32 *cs;
+
+		cs = intel_ring_begin(pipelined, 8);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_LOAD_REGISTER_IMM(3);
+		*cs++ = i915_mmio_reg_offset(fence_reg_lo);
+		*cs++ = 0;
+		*cs++ = i915_mmio_reg_offset(fence_reg_hi);
+		*cs++ = upper_32_bits(val);
+		*cs++ = i915_mmio_reg_offset(fence_reg_lo);
+		*cs++ = lower_32_bits(val);
+		*cs++ = MI_NOOP;
+		intel_ring_advance(pipelined, cs);
 	}
+
+	return 0;
 }
 
-static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
-				 struct i915_vma *vma)
+static int i915_write_fence_reg(struct drm_i915_fence_reg *fence,
+				struct i915_vma *vma,
+				struct drm_i915_gem_request *pipelined)
 {
 	u32 val;
 
@@ -150,11 +168,26 @@  static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
 
 		I915_WRITE(reg, val);
 		POSTING_READ(reg);
+	} else {
+		u32 *cs;
+
+		cs = intel_ring_begin(pipelined, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_LOAD_REGISTER_IMM(1);
+		*cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
+		*cs++ = val;
+		*cs++ = MI_NOOP;
+		intel_ring_advance(pipelined, cs);
 	}
+
+	return 0;
 }
 
-static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
-				 struct i915_vma *vma)
+static int i830_write_fence_reg(struct drm_i915_fence_reg *fence,
+				struct i915_vma *vma,
+				struct drm_i915_gem_request *pipelined)
 {
 	u32 val;
 
@@ -182,29 +215,49 @@  static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
 
 		I915_WRITE(reg, val);
 		POSTING_READ(reg);
+	} else {
+		u32 *cs;
+
+		cs = intel_ring_begin(pipelined, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_LOAD_REGISTER_IMM(1);
+		*cs++ = i915_mmio_reg_offset(FENCE_REG(fence->id));
+		*cs++ = val;
+		*cs++ = MI_NOOP;
+		intel_ring_advance(pipelined, cs);
 	}
+
+	return 0;
 }
 
-static void fence_write(struct drm_i915_fence_reg *fence,
-			struct i915_vma *vma)
+static int fence_write(struct drm_i915_fence_reg *fence,
+		       struct i915_vma *vma,
+		       struct drm_i915_gem_request *rq)
 {
+	int err;
+
 	/* Previous access through the fence register is marshalled by
 	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
 	 * and explicitly managed for internal users.
 	 */
 
 	if (IS_GEN2(fence->i915))
-		i830_write_fence_reg(fence, vma);
+		err = i830_write_fence_reg(fence, vma, rq);
 	else if (IS_GEN3(fence->i915))
-		i915_write_fence_reg(fence, vma);
+		err = i915_write_fence_reg(fence, vma, rq);
 	else
-		i965_write_fence_reg(fence, vma);
+		err = i965_write_fence_reg(fence, vma, rq);
+	if (err)
+		return err;
 
 	/* Access through the fenced region afterwards is
 	 * ordered by the posting reads whilst writing the registers.
 	 */
 
 	fence->dirty = false;
+	return 0;
 }
 
 static int fence_update(struct drm_i915_fence_reg *fence,
@@ -212,17 +265,15 @@  static int fence_update(struct drm_i915_fence_reg *fence,
 {
 	int ret;
 
+	ret = i915_gem_active_retire(&fence->pipelined,
+				     &fence->i915->drm.struct_mutex);
+	if (ret)
+		return ret;
+
 	if (vma) {
 		if (!i915_vma_is_map_and_fenceable(vma))
 			return -EINVAL;
 
-		if (WARN(!i915_gem_object_get_stride(vma->obj) ||
-			 !i915_gem_object_get_tiling(vma->obj),
-			 "bogus fence setup with stride: 0x%x, tiling mode: %i\n",
-			 i915_gem_object_get_stride(vma->obj),
-			 i915_gem_object_get_tiling(vma->obj)))
-			return -EINVAL;
-
 		ret = i915_gem_active_retire(&vma->last_fence,
 					     &vma->obj->base.dev->struct_mutex);
 		if (ret)
@@ -253,7 +304,7 @@  static int fence_update(struct drm_i915_fence_reg *fence,
 	 * to the runtime resume, see i915_gem_restore_fences().
 	 */
 	if (intel_runtime_pm_get_if_in_use(fence->i915)) {
-		fence_write(fence, vma);
+		fence_write(fence, vma, NULL);
 		intel_runtime_pm_put(fence->i915);
 	}
 
@@ -287,6 +338,8 @@  int i915_vma_put_fence(struct i915_vma *vma)
 	if (!fence)
 		return 0;
 
+	GEM_BUG_ON(fence->vma != vma);
+
 	if (fence->pin_count)
 		return -EBUSY;
 
@@ -344,11 +397,16 @@  i915_vma_pin_fence(struct i915_vma *vma)
 	assert_rpm_wakelock_held(vma->vm->i915);
 
 	/* Just update our place in the LRU if our fence is getting reused. */
-	if (vma->fence) {
-		fence = vma->fence;
+	fence = vma->fence;
+	if (fence) {
 		GEM_BUG_ON(fence->vma != vma);
 		fence->pin_count++;
 		if (!fence->dirty) {
+			err = i915_gem_active_retire(&fence->pipelined,
+						     &fence->i915->drm.struct_mutex);
+			if (err)
+				goto err_unpin;
+
 			list_move_tail(&fence->link,
 				       &fence->i915->mm.fence_list);
 			return 0;
@@ -372,6 +430,93 @@  i915_vma_pin_fence(struct i915_vma *vma)
 	return err;
 }
 
+int i915_vma_reserve_fence(struct i915_vma *vma)
+{
+	struct drm_i915_fence_reg *fence;
+
+	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
+	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
+	GEM_BUG_ON(!i915_vma_is_pinned(vma));
+
+	fence = vma->fence;
+	if (!fence) {
+		if (!i915_gem_object_is_tiled(vma->obj))
+			return 0;
+
+		if (!i915_vma_is_map_and_fenceable(vma))
+			return -EINVAL;
+
+		fence = fence_find(vma->vm->i915);
+		if (IS_ERR(fence))
+			return PTR_ERR(fence);
+
+		vma->fence = fence;
+
+		if (fence->vma) {
+			i915_gem_release_mmap(fence->vma->obj);
+			fence->vma->fence = NULL;
+		}
+		fence->vma = vma;
+		fence->dirty = true;
+	}
+	fence->pin_count++;
+	list_move_tail(&fence->link, &fence->i915->mm.fence_list);
+
+	GEM_BUG_ON(!i915_gem_object_is_tiled(vma->obj));
+	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
+	GEM_BUG_ON(vma->node.size != vma->fence_size);
+	GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_alignment));
+
+	return 0;
+}
+
+int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
+				  struct drm_i915_gem_request *rq)
+{
+	struct drm_i915_fence_reg *fence = vma->fence;
+	struct drm_i915_gem_request *prev;
+	int err;
+
+	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
+	GEM_BUG_ON(fence && !fence->pin_count);
+
+	if (!fence)
+		goto out;
+
+	prev = i915_gem_active_raw(&fence->pipelined,
+				   &fence->i915->drm.struct_mutex);
+	if (prev) {
+		err = i915_gem_request_await_dma_fence(rq, &prev->fence);
+		if (err)
+			return err;
+	}
+
+	if (!fence->dirty)
+		goto out;
+
+	GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
+
+	if (fence->vma) {
+		prev = i915_gem_active_raw(&fence->vma->last_fence,
+					   &fence->i915->drm.struct_mutex);
+		if (prev) {
+			err = i915_gem_request_await_dma_fence(rq,
+							       &prev->fence);
+			if (err)
+				return err;
+		}
+	}
+
+	err = fence_write(fence, vma, rq);
+	if (err)
+		return err;
+
+	i915_gem_active_set(&fence->pipelined, rq);
+out:
+	i915_gem_active_set(&vma->last_fence, rq);
+	return 0;
+}
+
 /**
  * i915_gem_revoke_fences - revoke fence state
  * @dev_priv: i915 device private
@@ -429,7 +574,7 @@  void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 			vma = NULL;
 		}
 
-		fence_write(reg, vma);
+		fence_write(reg, vma, NULL);
 		reg->vma = vma;
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 99a31ded4dfd..ce45972fc5c6 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -47,6 +47,7 @@  struct drm_i915_fence_reg {
 	 * command (such as BLT on gen2/3), as a "fence".
 	 */
 	bool dirty;
+	struct i915_gem_active pipelined;
 };
 
 #endif
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index efbfee8eac99..0c489090d4ab 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -728,6 +728,7 @@  int i915_vma_unbind(struct i915_vma *vma)
 		__i915_vma_iounmap(vma);
 		vma->flags &= ~I915_VMA_CAN_FENCE;
 	}
+	GEM_BUG_ON(vma->fence);
 
 	if (likely(!vma->vm->closed)) {
 		trace_i915_vma_unbind(vma);
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 19f58af4f1bf..f0dc6eaebeab 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -367,5 +367,9 @@  i915_vma_unpin_fence(struct i915_vma *vma)
 		__i915_vma_unpin_fence(vma);
 }
 
+int __must_check i915_vma_reserve_fence(struct i915_vma *vma);
+int i915_vma_emit_pipelined_fence(struct i915_vma *vma,
+				  struct drm_i915_gem_request *rq);
+
 #endif