diff mbox

[5/9] drm/i915/execlists: Unify CSB access pointers

Message ID 20180627210712.25428-5-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson June 27, 2018, 9:07 p.m. UTC
Following the removal of the last workarounds, the only CSB mmio access
is for the old vGPU interface. The mmio registers presented by vGPU do
not require forcewake and can be treated as ordinary volatile memory,
i.e. they behave just like the HWSP access just at a different location.
We can reduce the CSB access to a set of read/write/buffer pointers and
treat the various paths identically and not worry about forcewake.
(Forcewake is nightmare for worstcase latency, and we want to process
this all with irqsoff -- no latency allowed!)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_engine_cs.c  |  12 ---
 drivers/gpu/drm/i915/intel_lrc.c        | 116 ++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  23 +++--
 3 files changed, 65 insertions(+), 86 deletions(-)

Comments

Tvrtko Ursulin June 28, 2018, 10:02 a.m. UTC | #1
On 27/06/2018 22:07, Chris Wilson wrote:
> Following the removal of the last workarounds, the only CSB mmio access
> is for the old vGPU interface. The mmio registers presented by vGPU do
> not require forcewake and can be treated as ordinary volatile memory,
> i.e. they behave just like the HWSP access just at a different location.
> We can reduce the CSB access to a set of read/write/buffer pointers and
> treat the various paths identically and not worry about forcewake.
> (Forcewake is nightmare for worstcase latency, and we want to process
> this all with irqsoff -- no latency allowed!)
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/intel_engine_cs.c  |  12 ---
>   drivers/gpu/drm/i915/intel_lrc.c        | 116 ++++++++++--------------
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  23 +++--
>   3 files changed, 65 insertions(+), 86 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index d3264bd6e9dc..7209c22798e6 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -25,7 +25,6 @@
>   #include <drm/drm_print.h>
>   
>   #include "i915_drv.h"
> -#include "i915_vgpu.h"
>   #include "intel_ringbuffer.h"
>   #include "intel_lrc.h"
>   
> @@ -456,21 +455,10 @@ static void intel_engine_init_batch_pool(struct intel_engine_cs *engine)
>   	i915_gem_batch_pool_init(&engine->batch_pool, engine);
>   }
>   
> -static bool csb_force_mmio(struct drm_i915_private *i915)
> -{
> -	/* Older GVT emulation depends upon intercepting CSB mmio */
> -	if (intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915))
> -		return true;
> -
> -	return false;
> -}
> -
>   static void intel_engine_init_execlist(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   
> -	execlists->csb_use_mmio = csb_force_mmio(engine->i915);
> -
>   	execlists->port_mask = 1;
>   	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
>   	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 91656eb2f2db..368a8c74d11d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -137,6 +137,7 @@
>   #include <drm/i915_drm.h>
>   #include "i915_drv.h"
>   #include "i915_gem_render_state.h"
> +#include "i915_vgpu.h"
>   #include "intel_lrc_reg.h"
>   #include "intel_mocs.h"
>   #include "intel_workarounds.h"
> @@ -953,44 +954,23 @@ static void process_csb(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct execlist_port *port = execlists->port;
> -	struct drm_i915_private *i915 = engine->i915;
> -
> -	/* The HWSP contains a (cacheable) mirror of the CSB */
> -	const u32 *buf =
> -		&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> -	unsigned int head, tail;
> -	bool fw = false;
> +	const u32 * const buf = execlists->csb_status;
> +	u8 head, tail;
>   
>   	/* Clear before reading to catch new interrupts */
>   	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
>   	smp_mb__after_atomic();
>   
> -	if (unlikely(execlists->csb_use_mmio)) {
> -		intel_uncore_forcewake_get(i915, execlists->fw_domains);
> -		fw = true;
> -
> -		buf = (u32 * __force)
> -			(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> +	/* Note that csb_write, csb_status may be either in HWSP or mmio */
> +	head = execlists->csb_head;
> +	tail = READ_ONCE(*execlists->csb_write);

Under GVT when this is emulated mmio I think you need to mask and shift 
it with GEN8_CSB_WRITE_PTR.

> +	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
> +	if (unlikely(head == tail))
> +		return;
>   
> -		head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> -		tail = GEN8_CSB_WRITE_PTR(head);
> -		head = GEN8_CSB_READ_PTR(head);
> -		execlists->csb_head = head;
> -	} else {
> -		const int write_idx =
> -			intel_hws_csb_write_index(i915) -
> -			I915_HWS_CSB_BUF0_INDEX;
> +	rmb(); /* Hopefully paired with a wmb() in HW */
>   
> -		head = execlists->csb_head;
> -		tail = READ_ONCE(buf[write_idx]);
> -		rmb(); /* Hopefully paired with a wmb() in HW */
> -	}
> -	GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> -		  engine->name,
> -		  head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> -		  tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> -
> -	while (head != tail) {
> +	do {

Why convert while to do-while? Early unlikely return above handles the 
void process_csb calls? Would the same effect happen if you put unlikely 
in the while (head != tail) condition and would be simpler?

>   		struct i915_request *rq;
>   		unsigned int status;
>   		unsigned int count;
> @@ -1016,12 +996,12 @@ static void process_csb(struct intel_engine_cs *engine)
>   		 * status notifier.
>   		 */
>   
> -		status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
>   		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
>   			  engine->name, head,
> -			  status, buf[2*head + 1],
> +			  buf[2 * head + 0], buf[2 * head + 1],
>   			  execlists->active);
>   
> +		status = buf[2 * head];

Why not leave before GEM_TRACE above, as is, and then diff is smaller?

>   		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
>   			      GEN8_CTX_STATUS_PREEMPTED))
>   			execlists_set_active(execlists,
> @@ -1103,16 +1083,11 @@ static void process_csb(struct intel_engine_cs *engine)
>   		} else {
>   			port_set(port, port_pack(rq, count));
>   		}
> -	}
> +	} while (head != tail);
>   
> -	if (head != execlists->csb_head) {
> -		execlists->csb_head = head;
> -		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> -		       i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> -	}
> -
> -	if (unlikely(fw))
> -		intel_uncore_forcewake_put(i915, execlists->fw_domains);
> +	writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> +	       execlists->csb_read);
> +	execlists->csb_head = head;

I guess early return helps to avoid this, but since it is going away in 
a following patch maybe it is not worth it.

>   }
>   
>   /*
> @@ -2430,28 +2405,11 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
>   static void
>   logical_ring_setup(struct intel_engine_cs *engine)
>   {
> -	struct drm_i915_private *dev_priv = engine->i915;
> -	enum forcewake_domains fw_domains;
> -
>   	intel_engine_setup_common(engine);
>   
>   	/* Intentionally left blank. */
>   	engine->buffer = NULL;
>   
> -	fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
> -						    RING_ELSP(engine),
> -						    FW_REG_WRITE);
> -
> -	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
> -						     RING_CONTEXT_STATUS_PTR(engine),
> -						     FW_REG_READ | FW_REG_WRITE);
> -
> -	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
> -						     RING_CONTEXT_STATUS_BUF_BASE(engine),
> -						     FW_REG_READ);
> -
> -	engine->execlists.fw_domains = fw_domains;
> -
>   	tasklet_init(&engine->execlists.tasklet,
>   		     execlists_submission_tasklet, (unsigned long)engine);
>   
> @@ -2459,34 +2417,56 @@ logical_ring_setup(struct intel_engine_cs *engine)
>   	logical_ring_default_irqs(engine);
>   }
>   
> +static bool csb_force_mmio(struct drm_i915_private *i915)
> +{
> +	/* Older GVT emulation depends upon intercepting CSB mmio */
> +	return intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915);
> +}
> +
>   static int logical_ring_init(struct intel_engine_cs *engine)
>   {
> +	struct drm_i915_private *i915 = engine->i915;
> +	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	int ret;
>   
>   	ret = intel_engine_init_common(engine);
>   	if (ret)
>   		goto error;
>   
> -	if (HAS_LOGICAL_RING_ELSQ(engine->i915)) {
> -		engine->execlists.submit_reg = engine->i915->regs +
> +	if (HAS_LOGICAL_RING_ELSQ(i915)) {
> +		execlists->submit_reg = i915->regs +
>   			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
> -		engine->execlists.ctrl_reg = engine->i915->regs +
> +		execlists->ctrl_reg = i915->regs +
>   			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
>   	} else {
> -		engine->execlists.submit_reg = engine->i915->regs +
> +		execlists->submit_reg = i915->regs +
>   			i915_mmio_reg_offset(RING_ELSP(engine));
>   	}
>   
> -	engine->execlists.preempt_complete_status = ~0u;
> -	if (engine->i915->preempt_context) {
> +	execlists->preempt_complete_status = ~0u;
> +	if (i915->preempt_context) {
>   		struct intel_context *ce =
> -			to_intel_context(engine->i915->preempt_context, engine);
> +			to_intel_context(i915->preempt_context, engine);
>   
> -		engine->execlists.preempt_complete_status =
> +		execlists->preempt_complete_status =
>   			upper_32_bits(ce->lrc_desc);
>   	}
>   
> -	engine->execlists.csb_head = GEN8_CSB_ENTRIES - 1;
> +	execlists->csb_head = GEN8_CSB_ENTRIES - 1;
> +	execlists->csb_read =
> +		i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
> +	if (csb_force_mmio(i915)) {
> +		execlists->csb_status = (u32 __force *)
> +			(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> +
> +		execlists->csb_write = (u32 __force *)execlists->csb_read;
> +	} else {
> +		execlists->csb_status =
> +			&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> +
> +		execlists->csb_write =
> +			&engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
> +	}
>   
>   	return 0;
>   
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index a0bc7a8222b4..5b92c5f03e1d 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -300,19 +300,30 @@ struct intel_engine_execlists {
>   	struct rb_node *first;
>   
>   	/**
> -	 * @fw_domains: forcewake domains for irq tasklet
> +	 * @csb_head: context status buffer head
>   	 */
> -	unsigned int fw_domains;
> +	unsigned int csb_head;
>   
>   	/**
> -	 * @csb_head: context status buffer head
> +	 * @csb_read: control register for Context Switch buffer
> +	 *
> +	 * Note this register is always in mmio.
>   	 */
> -	unsigned int csb_head;
> +	u32 __iomem *csb_read;
>   
>   	/**
> -	 * @csb_use_mmio: access csb through mmio, instead of hwsp
> +	 * @csb_write: control register for Context Switch buffer
> +	 *
> +	 * Note this register may be either mmio or HWSP shadow.
> +	 */
> +	u32 *csb_write;
> +
> +	/**
> +	 * @csb_status: status array for Context Switch buffer
> +	 *
> +	 * Note these register may be either mmio or HWSP shadow.
>   	 */
> -	bool csb_use_mmio;
> +	u32 *csb_status;
>   
>   	/**
>   	 * @preempt_complete_status: expected CSB upon completing preemption
> 

Looks otherwise ok.

Regards,

Tvrtko
Chris Wilson June 28, 2018, 10:10 a.m. UTC | #2
Quoting Tvrtko Ursulin (2018-06-28 11:02:17)
> 
> On 27/06/2018 22:07, Chris Wilson wrote:
> > Following the removal of the last workarounds, the only CSB mmio access
> > is for the old vGPU interface. The mmio registers presented by vGPU do
> > not require forcewake and can be treated as ordinary volatile memory,
> > i.e. they behave just like the HWSP access just at a different location.
> > We can reduce the CSB access to a set of read/write/buffer pointers and
> > treat the various paths identically and not worry about forcewake.
> > (Forcewake is nightmare for worstcase latency, and we want to process
> > this all with irqsoff -- no latency allowed!)
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/intel_engine_cs.c  |  12 ---
> >   drivers/gpu/drm/i915/intel_lrc.c        | 116 ++++++++++--------------
> >   drivers/gpu/drm/i915/intel_ringbuffer.h |  23 +++--
> >   3 files changed, 65 insertions(+), 86 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> > index d3264bd6e9dc..7209c22798e6 100644
> > --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> > @@ -25,7 +25,6 @@
> >   #include <drm/drm_print.h>
> >   
> >   #include "i915_drv.h"
> > -#include "i915_vgpu.h"
> >   #include "intel_ringbuffer.h"
> >   #include "intel_lrc.h"
> >   
> > @@ -456,21 +455,10 @@ static void intel_engine_init_batch_pool(struct intel_engine_cs *engine)
> >       i915_gem_batch_pool_init(&engine->batch_pool, engine);
> >   }
> >   
> > -static bool csb_force_mmio(struct drm_i915_private *i915)
> > -{
> > -     /* Older GVT emulation depends upon intercepting CSB mmio */
> > -     if (intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915))
> > -             return true;
> > -
> > -     return false;
> > -}
> > -
> >   static void intel_engine_init_execlist(struct intel_engine_cs *engine)
> >   {
> >       struct intel_engine_execlists * const execlists = &engine->execlists;
> >   
> > -     execlists->csb_use_mmio = csb_force_mmio(engine->i915);
> > -
> >       execlists->port_mask = 1;
> >       BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
> >       GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 91656eb2f2db..368a8c74d11d 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -137,6 +137,7 @@
> >   #include <drm/i915_drm.h>
> >   #include "i915_drv.h"
> >   #include "i915_gem_render_state.h"
> > +#include "i915_vgpu.h"
> >   #include "intel_lrc_reg.h"
> >   #include "intel_mocs.h"
> >   #include "intel_workarounds.h"
> > @@ -953,44 +954,23 @@ static void process_csb(struct intel_engine_cs *engine)
> >   {
> >       struct intel_engine_execlists * const execlists = &engine->execlists;
> >       struct execlist_port *port = execlists->port;
> > -     struct drm_i915_private *i915 = engine->i915;
> > -
> > -     /* The HWSP contains a (cacheable) mirror of the CSB */
> > -     const u32 *buf =
> > -             &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
> > -     unsigned int head, tail;
> > -     bool fw = false;
> > +     const u32 * const buf = execlists->csb_status;
> > +     u8 head, tail;
> >   
> >       /* Clear before reading to catch new interrupts */
> >       clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> >       smp_mb__after_atomic();
> >   
> > -     if (unlikely(execlists->csb_use_mmio)) {
> > -             intel_uncore_forcewake_get(i915, execlists->fw_domains);
> > -             fw = true;
> > -
> > -             buf = (u32 * __force)
> > -                     (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
> > +     /* Note that csb_write, csb_status may be either in HWSP or mmio */
> > +     head = execlists->csb_head;
> > +     tail = READ_ONCE(*execlists->csb_write);
> 
> Under GVT when this is emulated mmio I think you need to mask and shift 
> it with GEN8_CSB_WRITE_PTR.

Shift is 0, mask is applied by u8.

> > +     GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
> > +     if (unlikely(head == tail))
> > +             return;
> >   
> > -             head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > -             tail = GEN8_CSB_WRITE_PTR(head);
> > -             head = GEN8_CSB_READ_PTR(head);
> > -             execlists->csb_head = head;
> > -     } else {
> > -             const int write_idx =
> > -                     intel_hws_csb_write_index(i915) -
> > -                     I915_HWS_CSB_BUF0_INDEX;
> > +     rmb(); /* Hopefully paired with a wmb() in HW */
> >   
> > -             head = execlists->csb_head;
> > -             tail = READ_ONCE(buf[write_idx]);
> > -             rmb(); /* Hopefully paired with a wmb() in HW */
> > -     }
> > -     GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> > -               engine->name,
> > -               head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> > -               tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> > -
> > -     while (head != tail) {
> > +     do {
> 
> Why convert while to do-while? Early unlikely return above handles the 
> void process_csb calls? Would the same effect happen if you put unlikely 
> in the while (head != tail) condition and would be simpler?

The earlier return to circumvent the lfence.

> >               struct i915_request *rq;
> >               unsigned int status;
> >               unsigned int count;
> > @@ -1016,12 +996,12 @@ static void process_csb(struct intel_engine_cs *engine)
> >                * status notifier.
> >                */
> >   
> > -             status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
> >               GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
> >                         engine->name, head,
> > -                       status, buf[2*head + 1],
> > +                       buf[2 * head + 0], buf[2 * head + 1],
> >                         execlists->active);
> >   
> > +             status = buf[2 * head];
> 
> Why not leave before GEM_TRACE above, as is, and then diff is smaller?

You made me correct the GEM_TRACE!

Looking at it I still prefer 2 buf[] vs the mixed local/buf[].

> >               if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
> >                             GEN8_CTX_STATUS_PREEMPTED))
> >                       execlists_set_active(execlists,
> > @@ -1103,16 +1083,11 @@ static void process_csb(struct intel_engine_cs *engine)
> >               } else {
> >                       port_set(port, port_pack(rq, count));
> >               }
> > -     }
> > +     } while (head != tail);
> >   
> > -     if (head != execlists->csb_head) {
> > -             execlists->csb_head = head;
> > -             writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> > -                    i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> > -     }
> > -
> > -     if (unlikely(fw))
> > -             intel_uncore_forcewake_put(i915, execlists->fw_domains);
> > +     writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> > +            execlists->csb_read);
> > +     execlists->csb_head = head;
> 
> I guess early return helps to avoid this, but since it is going away in 
> a following patch maybe it is not worth it.

lfence!
-Chris
Tvrtko Ursulin June 28, 2018, 10:58 a.m. UTC | #3
On 28/06/2018 11:10, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-06-28 11:02:17)
>>
>> On 27/06/2018 22:07, Chris Wilson wrote:
>>> Following the removal of the last workarounds, the only CSB mmio access
>>> is for the old vGPU interface. The mmio registers presented by vGPU do
>>> not require forcewake and can be treated as ordinary volatile memory,
>>> i.e. they behave just like the HWSP access just at a different location.
>>> We can reduce the CSB access to a set of read/write/buffer pointers and
>>> treat the various paths identically and not worry about forcewake.
>>> (Forcewake is nightmare for worstcase latency, and we want to process
>>> this all with irqsoff -- no latency allowed!)
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    drivers/gpu/drm/i915/intel_engine_cs.c  |  12 ---
>>>    drivers/gpu/drm/i915/intel_lrc.c        | 116 ++++++++++--------------
>>>    drivers/gpu/drm/i915/intel_ringbuffer.h |  23 +++--
>>>    3 files changed, 65 insertions(+), 86 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
>>> index d3264bd6e9dc..7209c22798e6 100644
>>> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
>>> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
>>> @@ -25,7 +25,6 @@
>>>    #include <drm/drm_print.h>
>>>    
>>>    #include "i915_drv.h"
>>> -#include "i915_vgpu.h"
>>>    #include "intel_ringbuffer.h"
>>>    #include "intel_lrc.h"
>>>    
>>> @@ -456,21 +455,10 @@ static void intel_engine_init_batch_pool(struct intel_engine_cs *engine)
>>>        i915_gem_batch_pool_init(&engine->batch_pool, engine);
>>>    }
>>>    
>>> -static bool csb_force_mmio(struct drm_i915_private *i915)
>>> -{
>>> -     /* Older GVT emulation depends upon intercepting CSB mmio */
>>> -     if (intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915))
>>> -             return true;
>>> -
>>> -     return false;
>>> -}
>>> -
>>>    static void intel_engine_init_execlist(struct intel_engine_cs *engine)
>>>    {
>>>        struct intel_engine_execlists * const execlists = &engine->execlists;
>>>    
>>> -     execlists->csb_use_mmio = csb_force_mmio(engine->i915);
>>> -
>>>        execlists->port_mask = 1;
>>>        BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
>>>        GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 91656eb2f2db..368a8c74d11d 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -137,6 +137,7 @@
>>>    #include <drm/i915_drm.h>
>>>    #include "i915_drv.h"
>>>    #include "i915_gem_render_state.h"
>>> +#include "i915_vgpu.h"
>>>    #include "intel_lrc_reg.h"
>>>    #include "intel_mocs.h"
>>>    #include "intel_workarounds.h"
>>> @@ -953,44 +954,23 @@ static void process_csb(struct intel_engine_cs *engine)
>>>    {
>>>        struct intel_engine_execlists * const execlists = &engine->execlists;
>>>        struct execlist_port *port = execlists->port;
>>> -     struct drm_i915_private *i915 = engine->i915;
>>> -
>>> -     /* The HWSP contains a (cacheable) mirror of the CSB */
>>> -     const u32 *buf =
>>> -             &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
>>> -     unsigned int head, tail;
>>> -     bool fw = false;
>>> +     const u32 * const buf = execlists->csb_status;
>>> +     u8 head, tail;
>>>    
>>>        /* Clear before reading to catch new interrupts */
>>>        clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
>>>        smp_mb__after_atomic();
>>>    
>>> -     if (unlikely(execlists->csb_use_mmio)) {
>>> -             intel_uncore_forcewake_get(i915, execlists->fw_domains);
>>> -             fw = true;
>>> -
>>> -             buf = (u32 * __force)
>>> -                     (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
>>> +     /* Note that csb_write, csb_status may be either in HWSP or mmio */
>>> +     head = execlists->csb_head;
>>> +     tail = READ_ONCE(*execlists->csb_write);
>>
>> Under GVT when this is emulated mmio I think you need to mask and shift
>> it with GEN8_CSB_WRITE_PTR.
> 
> Shift is 0, mask is applied by u8.

Evil. :) Put a comment please.

>>> +     GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
>>> +     if (unlikely(head == tail))
>>> +             return;
>>>    
>>> -             head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
>>> -             tail = GEN8_CSB_WRITE_PTR(head);
>>> -             head = GEN8_CSB_READ_PTR(head);
>>> -             execlists->csb_head = head;
>>> -     } else {
>>> -             const int write_idx =
>>> -                     intel_hws_csb_write_index(i915) -
>>> -                     I915_HWS_CSB_BUF0_INDEX;
>>> +     rmb(); /* Hopefully paired with a wmb() in HW */
>>>    
>>> -             head = execlists->csb_head;
>>> -             tail = READ_ONCE(buf[write_idx]);
>>> -             rmb(); /* Hopefully paired with a wmb() in HW */
>>> -     }
>>> -     GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
>>> -               engine->name,
>>> -               head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
>>> -               tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
>>> -
>>> -     while (head != tail) {
>>> +     do {
>>
>> Why convert while to do-while? Early unlikely return above handles the
>> void process_csb calls? Would the same effect happen if you put unlikely
>> in the while (head != tail) condition and would be simpler?
> 
> The earlier return to circumvent the lfence.

Oh that one.. so why it is safe to compare head and tail before the rmb 
since we need the rmb afterwards?

> 
>>>                struct i915_request *rq;
>>>                unsigned int status;
>>>                unsigned int count;
>>> @@ -1016,12 +996,12 @@ static void process_csb(struct intel_engine_cs *engine)
>>>                 * status notifier.
>>>                 */
>>>    
>>> -             status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
>>>                GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
>>>                          engine->name, head,
>>> -                       status, buf[2*head + 1],
>>> +                       buf[2 * head + 0], buf[2 * head + 1],
>>>                          execlists->active);
>>>    
>>> +             status = buf[2 * head];
>>
>> Why not leave before GEM_TRACE above, as is, and then diff is smaller?
> 
> You made me correct the GEM_TRACE!

Hm?

> Looking at it I still prefer 2 buf[] vs the mixed local/buf[].

Not touching lines which don't need touching trumps in my book. But OK.

Regards,

Tvrtko

> 
>>>                if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
>>>                              GEN8_CTX_STATUS_PREEMPTED))
>>>                        execlists_set_active(execlists,
>>> @@ -1103,16 +1083,11 @@ static void process_csb(struct intel_engine_cs *engine)
>>>                } else {
>>>                        port_set(port, port_pack(rq, count));
>>>                }
>>> -     }
>>> +     } while (head != tail);
>>>    
>>> -     if (head != execlists->csb_head) {
>>> -             execlists->csb_head = head;
>>> -             writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
>>> -                    i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
>>> -     }
>>> -
>>> -     if (unlikely(fw))
>>> -             intel_uncore_forcewake_put(i915, execlists->fw_domains);
>>> +     writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
>>> +            execlists->csb_read);
>>> +     execlists->csb_head = head;
>>
>> I guess early return helps to avoid this, but since it is going away in
>> a following patch maybe it is not worth it.
> 
> lfence!
> -Chris
>
Chris Wilson June 28, 2018, 11:05 a.m. UTC | #4
Quoting Tvrtko Ursulin (2018-06-28 11:58:26)
> On 28/06/2018 11:10, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2018-06-28 11:02:17)
> >>
> >> On 27/06/2018 22:07, Chris Wilson wrote:
> >>> +     GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
> >>> +     if (unlikely(head == tail))
> >>> +             return;
> >>>    
> >>> -             head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> >>> -             tail = GEN8_CSB_WRITE_PTR(head);
> >>> -             head = GEN8_CSB_READ_PTR(head);
> >>> -             execlists->csb_head = head;
> >>> -     } else {
> >>> -             const int write_idx =
> >>> -                     intel_hws_csb_write_index(i915) -
> >>> -                     I915_HWS_CSB_BUF0_INDEX;
> >>> +     rmb(); /* Hopefully paired with a wmb() in HW */
> >>>    
> >>> -             head = execlists->csb_head;
> >>> -             tail = READ_ONCE(buf[write_idx]);
> >>> -             rmb(); /* Hopefully paired with a wmb() in HW */
> >>> -     }
> >>> -     GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
> >>> -               engine->name,
> >>> -               head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
> >>> -               tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
> >>> -
> >>> -     while (head != tail) {
> >>> +     do {
> >>
> >> Why convert while to do-while? Early unlikely return above handles the
> >> void process_csb calls? Would the same effect happen if you put unlikely
> >> in the while (head != tail) condition and would be simpler?
> > 
> > The earlier return to circumvent the lfence.
> 
> Oh that one.. so why it is safe to compare head and tail before the rmb 
> since we need the rmb afterwards?

There's a direct data dependency in the control flow of using the
volatile read from HWSP, but later on there is no data dependencies
between the write pointer we've "already" used and the rest of the CSB[]
we want to pull from HWSP. So while it seems unlikely, without that
lfence those later reads could be performed before the test (but the
test itself can't be performed before the read!). And sadly we can't get
away with our older read_barrier(), which for a long time I thought was
sufficient to order the read of the write pointer before the reads of
CSB[].
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index d3264bd6e9dc..7209c22798e6 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -25,7 +25,6 @@ 
 #include <drm/drm_print.h>
 
 #include "i915_drv.h"
-#include "i915_vgpu.h"
 #include "intel_ringbuffer.h"
 #include "intel_lrc.h"
 
@@ -456,21 +455,10 @@  static void intel_engine_init_batch_pool(struct intel_engine_cs *engine)
 	i915_gem_batch_pool_init(&engine->batch_pool, engine);
 }
 
-static bool csb_force_mmio(struct drm_i915_private *i915)
-{
-	/* Older GVT emulation depends upon intercepting CSB mmio */
-	if (intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915))
-		return true;
-
-	return false;
-}
-
 static void intel_engine_init_execlist(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 
-	execlists->csb_use_mmio = csb_force_mmio(engine->i915);
-
 	execlists->port_mask = 1;
 	BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
 	GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 91656eb2f2db..368a8c74d11d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -137,6 +137,7 @@ 
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_vgpu.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
 #include "intel_workarounds.h"
@@ -953,44 +954,23 @@  static void process_csb(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
-	struct drm_i915_private *i915 = engine->i915;
-
-	/* The HWSP contains a (cacheable) mirror of the CSB */
-	const u32 *buf =
-		&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
-	unsigned int head, tail;
-	bool fw = false;
+	const u32 * const buf = execlists->csb_status;
+	u8 head, tail;
 
 	/* Clear before reading to catch new interrupts */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
 	smp_mb__after_atomic();
 
-	if (unlikely(execlists->csb_use_mmio)) {
-		intel_uncore_forcewake_get(i915, execlists->fw_domains);
-		fw = true;
-
-		buf = (u32 * __force)
-			(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
+	/* Note that csb_write, csb_status may be either in HWSP or mmio */
+	head = execlists->csb_head;
+	tail = READ_ONCE(*execlists->csb_write);
+	GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
+	if (unlikely(head == tail))
+		return;
 
-		head = readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
-		tail = GEN8_CSB_WRITE_PTR(head);
-		head = GEN8_CSB_READ_PTR(head);
-		execlists->csb_head = head;
-	} else {
-		const int write_idx =
-			intel_hws_csb_write_index(i915) -
-			I915_HWS_CSB_BUF0_INDEX;
+	rmb(); /* Hopefully paired with a wmb() in HW */
 
-		head = execlists->csb_head;
-		tail = READ_ONCE(buf[write_idx]);
-		rmb(); /* Hopefully paired with a wmb() in HW */
-	}
-	GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
-		  engine->name,
-		  head, GEN8_CSB_READ_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?",
-		  tail, GEN8_CSB_WRITE_PTR(readl(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)))), fw ? "" : "?");
-
-	while (head != tail) {
+	do {
 		struct i915_request *rq;
 		unsigned int status;
 		unsigned int count;
@@ -1016,12 +996,12 @@  static void process_csb(struct intel_engine_cs *engine)
 		 * status notifier.
 		 */
 
-		status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
 		GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
 			  engine->name, head,
-			  status, buf[2*head + 1],
+			  buf[2 * head + 0], buf[2 * head + 1],
 			  execlists->active);
 
+		status = buf[2 * head];
 		if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
 			      GEN8_CTX_STATUS_PREEMPTED))
 			execlists_set_active(execlists,
@@ -1103,16 +1083,11 @@  static void process_csb(struct intel_engine_cs *engine)
 		} else {
 			port_set(port, port_pack(rq, count));
 		}
-	}
+	} while (head != tail);
 
-	if (head != execlists->csb_head) {
-		execlists->csb_head = head;
-		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
-		       i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
-	}
-
-	if (unlikely(fw))
-		intel_uncore_forcewake_put(i915, execlists->fw_domains);
+	writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
+	       execlists->csb_read);
+	execlists->csb_head = head;
 }
 
 /*
@@ -2430,28 +2405,11 @@  logical_ring_default_irqs(struct intel_engine_cs *engine)
 static void
 logical_ring_setup(struct intel_engine_cs *engine)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
-	enum forcewake_domains fw_domains;
-
 	intel_engine_setup_common(engine);
 
 	/* Intentionally left blank. */
 	engine->buffer = NULL;
 
-	fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
-						    RING_ELSP(engine),
-						    FW_REG_WRITE);
-
-	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
-						     RING_CONTEXT_STATUS_PTR(engine),
-						     FW_REG_READ | FW_REG_WRITE);
-
-	fw_domains |= intel_uncore_forcewake_for_reg(dev_priv,
-						     RING_CONTEXT_STATUS_BUF_BASE(engine),
-						     FW_REG_READ);
-
-	engine->execlists.fw_domains = fw_domains;
-
 	tasklet_init(&engine->execlists.tasklet,
 		     execlists_submission_tasklet, (unsigned long)engine);
 
@@ -2459,34 +2417,56 @@  logical_ring_setup(struct intel_engine_cs *engine)
 	logical_ring_default_irqs(engine);
 }
 
+static bool csb_force_mmio(struct drm_i915_private *i915)
+{
+	/* Older GVT emulation depends upon intercepting CSB mmio */
+	return intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915);
+}
+
 static int logical_ring_init(struct intel_engine_cs *engine)
 {
+	struct drm_i915_private *i915 = engine->i915;
+	struct intel_engine_execlists * const execlists = &engine->execlists;
 	int ret;
 
 	ret = intel_engine_init_common(engine);
 	if (ret)
 		goto error;
 
-	if (HAS_LOGICAL_RING_ELSQ(engine->i915)) {
-		engine->execlists.submit_reg = engine->i915->regs +
+	if (HAS_LOGICAL_RING_ELSQ(i915)) {
+		execlists->submit_reg = i915->regs +
 			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
-		engine->execlists.ctrl_reg = engine->i915->regs +
+		execlists->ctrl_reg = i915->regs +
 			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
 	} else {
-		engine->execlists.submit_reg = engine->i915->regs +
+		execlists->submit_reg = i915->regs +
 			i915_mmio_reg_offset(RING_ELSP(engine));
 	}
 
-	engine->execlists.preempt_complete_status = ~0u;
-	if (engine->i915->preempt_context) {
+	execlists->preempt_complete_status = ~0u;
+	if (i915->preempt_context) {
 		struct intel_context *ce =
-			to_intel_context(engine->i915->preempt_context, engine);
+			to_intel_context(i915->preempt_context, engine);
 
-		engine->execlists.preempt_complete_status =
+		execlists->preempt_complete_status =
 			upper_32_bits(ce->lrc_desc);
 	}
 
-	engine->execlists.csb_head = GEN8_CSB_ENTRIES - 1;
+	execlists->csb_head = GEN8_CSB_ENTRIES - 1;
+	execlists->csb_read =
+		i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
+	if (csb_force_mmio(i915)) {
+		execlists->csb_status = (u32 __force *)
+			(i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
+
+		execlists->csb_write = (u32 __force *)execlists->csb_read;
+	} else {
+		execlists->csb_status =
+			&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
+
+		execlists->csb_write =
+			&engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
+	}
 
 	return 0;
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a0bc7a8222b4..5b92c5f03e1d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -300,19 +300,30 @@  struct intel_engine_execlists {
 	struct rb_node *first;
 
 	/**
-	 * @fw_domains: forcewake domains for irq tasklet
+	 * @csb_head: context status buffer head
 	 */
-	unsigned int fw_domains;
+	unsigned int csb_head;
 
 	/**
-	 * @csb_head: context status buffer head
+	 * @csb_read: control register for Context Switch buffer
+	 *
+	 * Note this register is always in mmio.
 	 */
-	unsigned int csb_head;
+	u32 __iomem *csb_read;
 
 	/**
-	 * @csb_use_mmio: access csb through mmio, instead of hwsp
+	 * @csb_write: control register for Context Switch buffer
+	 *
+	 * Note this register may be either mmio or HWSP shadow.
+	 */
+	u32 *csb_write;
+
+	/**
+	 * @csb_status: status array for Context Switch buffer
+	 *
+	 * Note these register may be either mmio or HWSP shadow.
 	 */
-	bool csb_use_mmio;
+	u32 *csb_status;
 
 	/**
 	 * @preempt_complete_status: expected CSB upon completing preemption