[v5] drm/i915/icl: Enhanced execution list support
diff mbox

Message ID 1516226034-2805-1-git-send-email-daniele.ceraolospurio@intel.com
State New
Headers show

Commit Message

Daniele Ceraolo Spurio Jan. 17, 2018, 9:53 p.m. UTC
From: Thomas Daniel <thomas.daniel@intel.com>

Enhanced Execlists is an upgraded version of execlists which supports
up to 8 ports. The lrcs to be submitted are written to a submit queue,
which is then loaded on the HW. When writing to the ELSP register, the
lrcs are written cyclically in the queue from position 0 to position 7.
Alternatively, it is possible to write directly in the individual
positions of the queue using the ELSQ registers. To be able to re-use
all the existing code we're using the latter method and we're currently
limiting ourself to only using 2 elements.

The preemption flow is sligthly different with enhanced execlists, so
this patch turns preemption off temporarily for Gen11+ while we wait for
the new mechanism to land.

v2: Rebase.
v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
v5: Reword commit, rename regs to be closer to specs, turn off
    preemption (Daniele), reuse engine->execlists.elsp (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  5 ++++-
 drivers/gpu/drm/i915/intel_lrc.c        | 35 ++++++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/intel_lrc.h        |  3 +++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  6 ++++--
 4 files changed, 41 insertions(+), 8 deletions(-)

Comments

Mika Kuoppala Jan. 19, 2018, 1:05 p.m. UTC | #1
Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:

> From: Thomas Daniel <thomas.daniel@intel.com>
>
> Enhanced Execlists is an upgraded version of execlists which supports
> up to 8 ports. The lrcs to be submitted are written to a submit queue,
> which is then loaded on the HW. When writing to the ELSP register, the
> lrcs are written cyclically in the queue from position 0 to position 7.
> Alternatively, it is possible to write directly in the individual
> positions of the queue using the ELSQ registers. To be able to re-use
> all the existing code we're using the latter method and we're currently
> limiting ourself to only using 2 elements.
>
> The preemption flow is sligthly different with enhanced execlists, so
> this patch turns preemption off temporarily for Gen11+ while we wait for
> the new mechanism to land.
>
> v2: Rebase.
> v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
> v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
> v5: Reword commit, rename regs to be closer to specs, turn off
>     preemption (Daniele), reuse engine->execlists.elsp (Chris)
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Was going to adopt this patch from Rodrigo but you were faster.

I choose to stash the elsq and use it as a gen11 vs rest toggle:

Relevant bits:

+static inline void write_port(struct intel_engine_execlists * const execlists,
+                             unsigned int n,
+                             u64 desc)
+{
+       if (execlists->elsq)
+               gen11_elsq_write(desc, n, execlists->elsq);
+       else
+               gen8_elsp_write(desc, execlists->elsp);
+}
+
+static inline void submit_ports(struct intel_engine_execlists * const execlists)
+{
+       /* for gen11+ we need to manually load the submit queue */
+       if (execlists->elsq) {
+               struct intel_engine_cs *engine =
+                       container_of(execlists,
+                                    struct intel_engine_cs,
+                                    execlists);
+               struct drm_i915_private *dev_priv = engine->i915;
+
+               I915_WRITE_FW(RING_ELCR(engine), ELCR_LOAD);
+       }
+}
+

...
-Mika

> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  5 ++++-
>  drivers/gpu/drm/i915/intel_lrc.c        | 35 ++++++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/intel_lrc.h        |  3 +++
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  6 ++++--
>  4 files changed, 41 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c42015b..3163543 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2738,8 +2738,11 @@ static inline unsigned int i915_sg_segment_size(void)
>  
>  #define HAS_LOGICAL_RING_CONTEXTS(dev_priv) \
>  		((dev_priv)->info.has_logical_ring_contexts)
> +
> +/* XXX: Preemption disabled for Gen11+ until support for new flow lands */
>  #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
> -		((dev_priv)->info.has_logical_ring_preemption)
> +		((dev_priv)->info.has_logical_ring_preemption && \
> +		 INTEL_GEN(dev_priv) < 11)
>  
>  #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ff25f20..67ad7c9 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -428,11 +428,24 @@ static inline void elsp_write(u64 desc, u32 __iomem *elsp)
>  	writel(lower_32_bits(desc), elsp);
>  }
>  
> +static inline void elsqc_write(u64 desc, u32 __iomem *elsqc, u32 port)
> +{
> +	writel(lower_32_bits(desc), elsqc + port * 2);
> +	writel(upper_32_bits(desc), elsqc + port * 2 + 1);
> +}
> +
>  static void execlists_submit_ports(struct intel_engine_cs *engine)
>  {
> +	struct drm_i915_private *dev_priv = engine->i915;
>  	struct execlist_port *port = engine->execlists.port;
>  	unsigned int n;
>  
> +	/*
> +	 * Gen11+ note: the submit queue is not cleared after being submitted
> +	 * to the HW so we need to make sure we always clean it up. This is
> +	 * currently ensured by the fact that we always write the same number
> +	 * of elsq entries, keep this in mind before changing the loop below.
> +	 */
>  	for (n = execlists_num_ports(&engine->execlists); n--; ) {
>  		struct drm_i915_gem_request *rq;
>  		unsigned int count;
> @@ -456,8 +469,16 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>  			desc = 0;
>  		}
>  
> -		elsp_write(desc, engine->execlists.elsp);
> +		if (INTEL_GEN(dev_priv) >= 11)
> +			elsqc_write(desc, engine->execlists.els, n);
> +		else
> +			elsp_write(desc, engine->execlists.els);
>  	}
> +
> +	/* for gen11+ we need to manually load the submit queue */
> +	if (INTEL_GEN(dev_priv) >= 11)
> +		I915_WRITE_FW(RING_EXECLIST_CONTROL(engine), EL_CTRL_LOAD);
> +
>  	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
>  }
>  
> @@ -506,9 +527,9 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>  
>  	GEM_TRACE("%s\n", engine->name);
>  	for (n = execlists_num_ports(&engine->execlists); --n; )
> -		elsp_write(0, engine->execlists.elsp);
> +		elsp_write(0, engine->execlists.els);
>  
> -	elsp_write(ce->lrc_desc, engine->execlists.elsp);
> +	elsp_write(ce->lrc_desc, engine->execlists.els);
>  	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
>  }
>  
> @@ -2016,8 +2037,12 @@ static int logical_ring_init(struct intel_engine_cs *engine)
>  	if (ret)
>  		goto error;
>  
> -	engine->execlists.elsp =
> -		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +	if (INTEL_GEN(engine->i915) >= 11)
> +		engine->execlists.els = engine->i915->regs +
> +			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
> +	else
> +		engine->execlists.els = engine->i915->regs +
> +			i915_mmio_reg_offset(RING_ELSP(engine));
>  
>  	return 0;
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 6d4f9b9..3ab4266 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -38,6 +38,9 @@
>  #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	(1 << 0)
>  #define   CTX_CTRL_RS_CTX_ENABLE                (1 << 1)
>  #define RING_CONTEXT_STATUS_BUF_BASE(engine)	_MMIO((engine)->mmio_base + 0x370)
> +#define RING_EXECLIST_SQ_CONTENTS(engine)	_MMIO((engine)->mmio_base + 0x510)
> +#define RING_EXECLIST_CONTROL(engine)		_MMIO((engine)->mmio_base + 0x550)
> +#define	  EL_CTRL_LOAD				(1 << 0)
>  #define RING_CONTEXT_STATUS_BUF_LO(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8)
>  #define RING_CONTEXT_STATUS_BUF_HI(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8 + 4)
>  #define RING_CONTEXT_STATUS_PTR(engine)		_MMIO((engine)->mmio_base + 0x3a0)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index c5ff203..d36bb73 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -200,9 +200,11 @@ struct intel_engine_execlists {
>  	bool no_priolist;
>  
>  	/**
> -	 * @elsp: the ExecList Submission Port register
> +	 * @els: gen-specific execlist submission register
> +	 * set to the ExecList Submission Port (elsp) register pre-Gen11 and to
> +	 * the ExecList Submission Queue Contents register array for Gen11+
>  	 */
> -	u32 __iomem *elsp;
> +	u32 __iomem *els;
>  
>  	/**
>  	 * @port: execlist port states
> -- 
> 1.9.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Daniele Ceraolo Spurio Jan. 19, 2018, 4:15 p.m. UTC | #2
On 19/01/18 05:05, Mika Kuoppala wrote:
> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
> 
>> From: Thomas Daniel <thomas.daniel@intel.com>
>>
>> Enhanced Execlists is an upgraded version of execlists which supports
>> up to 8 ports. The lrcs to be submitted are written to a submit queue,
>> which is then loaded on the HW. When writing to the ELSP register, the
>> lrcs are written cyclically in the queue from position 0 to position 7.
>> Alternatively, it is possible to write directly in the individual
>> positions of the queue using the ELSQ registers. To be able to re-use
>> all the existing code we're using the latter method and we're currently
>> limiting ourself to only using 2 elements.
>>
>> The preemption flow is sligthly different with enhanced execlists, so
>> this patch turns preemption off temporarily for Gen11+ while we wait for
>> the new mechanism to land.
>>
>> v2: Rebase.
>> v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
>> v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
>> v5: Reword commit, rename regs to be closer to specs, turn off
>>      preemption (Daniele), reuse engine->execlists.elsp (Chris)
>>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> 
> Was going to adopt this patch from Rodrigo but you were faster.
> 
> I choose to stash the elsq and use it as a gen11 vs rest toggle:
> 
> Relevant bits:
> 
> +static inline void write_port(struct intel_engine_execlists * const execlists,
> +                             unsigned int n,
> +                             u64 desc)
> +{
> +       if (execlists->elsq)
> +               gen11_elsq_write(desc, n, execlists->elsq);
> +       else
> +               gen8_elsp_write(desc, execlists->elsp);
> +}
> +
> +static inline void submit_ports(struct intel_engine_execlists * const execlists)
> +{
> +       /* for gen11+ we need to manually load the submit queue */
> +       if (execlists->elsq) {
> +               struct intel_engine_cs *engine =
> +                       container_of(execlists,
> +                                    struct intel_engine_cs,
> +                                    execlists);
> +               struct drm_i915_private *dev_priv = engine->i915;
> +
> +               I915_WRITE_FW(RING_ELCR(engine), ELCR_LOAD);
> +       }
> +}
> +
>

I was undecided about hiding the code in sub-functions because of the 
pre-emption path. There is no need in gen11 to inject a context to 
preempt to idle, so the inject_preempt function will be pre-gen11 only 
and therefore I'd prefer to keep a direct call to elsp_write there. IMHO 
it'd be cleaner to have similar code in both places, hence the 
open-coding. This said, I'd be happy to change it like you proposed if 
there is a general preference to abstract things a bit in the shared 
path even if the pre-emption path stays different.

Regarding using execlists->elsq as a toggle, I was thinking that we 
could have a device info flag instead, so we could use it even before 
setting execlists->elsq. Any preference on this?

Thanks,
Daniele

P.S. If you want to take over feel free to send an updated patch ;)

> ...
> -Mika
> 
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h         |  5 ++++-
>>   drivers/gpu/drm/i915/intel_lrc.c        | 35 ++++++++++++++++++++++++++++-----
>>   drivers/gpu/drm/i915/intel_lrc.h        |  3 +++
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  6 ++++--
>>   4 files changed, 41 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index c42015b..3163543 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2738,8 +2738,11 @@ static inline unsigned int i915_sg_segment_size(void)
>>   
>>   #define HAS_LOGICAL_RING_CONTEXTS(dev_priv) \
>>   		((dev_priv)->info.has_logical_ring_contexts)
>> +
>> +/* XXX: Preemption disabled for Gen11+ until support for new flow lands */
>>   #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
>> -		((dev_priv)->info.has_logical_ring_preemption)
>> +		((dev_priv)->info.has_logical_ring_preemption && \
>> +		 INTEL_GEN(dev_priv) < 11)
>>   
>>   #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
>>   
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index ff25f20..67ad7c9 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -428,11 +428,24 @@ static inline void elsp_write(u64 desc, u32 __iomem *elsp)
>>   	writel(lower_32_bits(desc), elsp);
>>   }
>>   
>> +static inline void elsqc_write(u64 desc, u32 __iomem *elsqc, u32 port)
>> +{
>> +	writel(lower_32_bits(desc), elsqc + port * 2);
>> +	writel(upper_32_bits(desc), elsqc + port * 2 + 1);
>> +}
>> +
>>   static void execlists_submit_ports(struct intel_engine_cs *engine)
>>   {
>> +	struct drm_i915_private *dev_priv = engine->i915;
>>   	struct execlist_port *port = engine->execlists.port;
>>   	unsigned int n;
>>   
>> +	/*
>> +	 * Gen11+ note: the submit queue is not cleared after being submitted
>> +	 * to the HW so we need to make sure we always clean it up. This is
>> +	 * currently ensured by the fact that we always write the same number
>> +	 * of elsq entries, keep this in mind before changing the loop below.
>> +	 */
>>   	for (n = execlists_num_ports(&engine->execlists); n--; ) {
>>   		struct drm_i915_gem_request *rq;
>>   		unsigned int count;
>> @@ -456,8 +469,16 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>   			desc = 0;
>>   		}
>>   
>> -		elsp_write(desc, engine->execlists.elsp);
>> +		if (INTEL_GEN(dev_priv) >= 11)
>> +			elsqc_write(desc, engine->execlists.els, n);
>> +		else
>> +			elsp_write(desc, engine->execlists.els);
>>   	}
>> +
>> +	/* for gen11+ we need to manually load the submit queue */
>> +	if (INTEL_GEN(dev_priv) >= 11)
>> +		I915_WRITE_FW(RING_EXECLIST_CONTROL(engine), EL_CTRL_LOAD);
>> +
>>   	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
>>   }
>>   
>> @@ -506,9 +527,9 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>>   
>>   	GEM_TRACE("%s\n", engine->name);
>>   	for (n = execlists_num_ports(&engine->execlists); --n; )
>> -		elsp_write(0, engine->execlists.elsp);
>> +		elsp_write(0, engine->execlists.els);
>>   
>> -	elsp_write(ce->lrc_desc, engine->execlists.elsp);
>> +	elsp_write(ce->lrc_desc, engine->execlists.els);
>>   	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
>>   }
>>   
>> @@ -2016,8 +2037,12 @@ static int logical_ring_init(struct intel_engine_cs *engine)
>>   	if (ret)
>>   		goto error;
>>   
>> -	engine->execlists.elsp =
>> -		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
>> +	if (INTEL_GEN(engine->i915) >= 11)
>> +		engine->execlists.els = engine->i915->regs +
>> +			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
>> +	else
>> +		engine->execlists.els = engine->i915->regs +
>> +			i915_mmio_reg_offset(RING_ELSP(engine));
>>   
>>   	return 0;
>>   
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
>> index 6d4f9b9..3ab4266 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.h
>> +++ b/drivers/gpu/drm/i915/intel_lrc.h
>> @@ -38,6 +38,9 @@
>>   #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	(1 << 0)
>>   #define   CTX_CTRL_RS_CTX_ENABLE                (1 << 1)
>>   #define RING_CONTEXT_STATUS_BUF_BASE(engine)	_MMIO((engine)->mmio_base + 0x370)
>> +#define RING_EXECLIST_SQ_CONTENTS(engine)	_MMIO((engine)->mmio_base + 0x510)
>> +#define RING_EXECLIST_CONTROL(engine)		_MMIO((engine)->mmio_base + 0x550)
>> +#define	  EL_CTRL_LOAD				(1 << 0)
>>   #define RING_CONTEXT_STATUS_BUF_LO(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8)
>>   #define RING_CONTEXT_STATUS_BUF_HI(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8 + 4)
>>   #define RING_CONTEXT_STATUS_PTR(engine)		_MMIO((engine)->mmio_base + 0x3a0)
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index c5ff203..d36bb73 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -200,9 +200,11 @@ struct intel_engine_execlists {
>>   	bool no_priolist;
>>   
>>   	/**
>> -	 * @elsp: the ExecList Submission Port register
>> +	 * @els: gen-specific execlist submission register
>> +	 * set to the ExecList Submission Port (elsp) register pre-Gen11 and to
>> +	 * the ExecList Submission Queue Contents register array for Gen11+
>>   	 */
>> -	u32 __iomem *elsp;
>> +	u32 __iomem *els;
>>   
>>   	/**
>>   	 * @port: execlist port states
>> -- 
>> 1.9.1
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Mika Kuoppala Jan. 22, 2018, 3:08 p.m. UTC | #3
Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:

> On 19/01/18 05:05, Mika Kuoppala wrote:
>> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
>> 
>>> From: Thomas Daniel <thomas.daniel@intel.com>
>>>
>>> Enhanced Execlists is an upgraded version of execlists which supports
>>> up to 8 ports. The lrcs to be submitted are written to a submit queue,
>>> which is then loaded on the HW. When writing to the ELSP register, the
>>> lrcs are written cyclically in the queue from position 0 to position 7.
>>> Alternatively, it is possible to write directly in the individual
>>> positions of the queue using the ELSQ registers. To be able to re-use
>>> all the existing code we're using the latter method and we're currently
>>> limiting ourself to only using 2 elements.
>>>
>>> The preemption flow is sligthly different with enhanced execlists, so
>>> this patch turns preemption off temporarily for Gen11+ while we wait for
>>> the new mechanism to land.
>>>
>>> v2: Rebase.
>>> v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
>>> v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
>>> v5: Reword commit, rename regs to be closer to specs, turn off
>>>      preemption (Daniele), reuse engine->execlists.elsp (Chris)
>>>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
>>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>> 
>> Was going to adopt this patch from Rodrigo but you were faster.
>> 
>> I choose to stash the elsq and use it as a gen11 vs rest toggle:
>> 
>> Relevant bits:
>> 
>> +static inline void write_port(struct intel_engine_execlists * const execlists,
>> +                             unsigned int n,
>> +                             u64 desc)
>> +{
>> +       if (execlists->elsq)
>> +               gen11_elsq_write(desc, n, execlists->elsq);
>> +       else
>> +               gen8_elsp_write(desc, execlists->elsp);
>> +}
>> +
>> +static inline void submit_ports(struct intel_engine_execlists * const execlists)
>> +{
>> +       /* for gen11+ we need to manually load the submit queue */
>> +       if (execlists->elsq) {
>> +               struct intel_engine_cs *engine =
>> +                       container_of(execlists,
>> +                                    struct intel_engine_cs,
>> +                                    execlists);
>> +               struct drm_i915_private *dev_priv = engine->i915;
>> +
>> +               I915_WRITE_FW(RING_ELCR(engine), ELCR_LOAD);
>> +       }
>> +}
>> +
>>
>
> I was undecided about hiding the code in sub-functions because of the 
> pre-emption path. There is no need in gen11 to inject a context to 
> preempt to idle, so the inject_preempt function will be pre-gen11 only 
> and therefore I'd prefer to keep a direct call to elsp_write there. IMHO 
> it'd be cleaner to have similar code in both places, hence the 
> open-coding. This said, I'd be happy to change it like you proposed if 
> there is a general preference to abstract things a bit in the shared 
> path even if the pre-emption path stays different.
>

Please don't change. I did the more abstract version before
learning that gen11 don't need the special preempt switch.

> Regarding using execlists->elsq as a toggle, I was thinking that we 
> could have a device info flag instead, so we could use it even before 
> setting execlists->elsq. Any preference on this?

has_logical_ring_elsq? Doesn't taste bad.

> Thanks,
> Daniele
>
> P.S. If you want to take over feel free to send an updated patch ;)
>

No need to take over, I thought it was orphaned patch :)

-Mika

>> ...
>> -Mika
>> 
>>> ---
>>>   drivers/gpu/drm/i915/i915_drv.h         |  5 ++++-
>>>   drivers/gpu/drm/i915/intel_lrc.c        | 35 ++++++++++++++++++++++++++++-----
>>>   drivers/gpu/drm/i915/intel_lrc.h        |  3 +++
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  6 ++++--
>>>   4 files changed, 41 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index c42015b..3163543 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -2738,8 +2738,11 @@ static inline unsigned int i915_sg_segment_size(void)
>>>   
>>>   #define HAS_LOGICAL_RING_CONTEXTS(dev_priv) \
>>>   		((dev_priv)->info.has_logical_ring_contexts)
>>> +
>>> +/* XXX: Preemption disabled for Gen11+ until support for new flow lands */
>>>   #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
>>> -		((dev_priv)->info.has_logical_ring_preemption)
>>> +		((dev_priv)->info.has_logical_ring_preemption && \
>>> +		 INTEL_GEN(dev_priv) < 11)
>>>   
>>>   #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
>>>   
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index ff25f20..67ad7c9 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -428,11 +428,24 @@ static inline void elsp_write(u64 desc, u32 __iomem *elsp)
>>>   	writel(lower_32_bits(desc), elsp);
>>>   }
>>>   
>>> +static inline void elsqc_write(u64 desc, u32 __iomem *elsqc, u32 port)
>>> +{
>>> +	writel(lower_32_bits(desc), elsqc + port * 2);
>>> +	writel(upper_32_bits(desc), elsqc + port * 2 + 1);
>>> +}
>>> +
>>>   static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>   {
>>> +	struct drm_i915_private *dev_priv = engine->i915;
>>>   	struct execlist_port *port = engine->execlists.port;
>>>   	unsigned int n;
>>>   
>>> +	/*
>>> +	 * Gen11+ note: the submit queue is not cleared after being submitted
>>> +	 * to the HW so we need to make sure we always clean it up. This is
>>> +	 * currently ensured by the fact that we always write the same number
>>> +	 * of elsq entries, keep this in mind before changing the loop below.
>>> +	 */
>>>   	for (n = execlists_num_ports(&engine->execlists); n--; ) {
>>>   		struct drm_i915_gem_request *rq;
>>>   		unsigned int count;
>>> @@ -456,8 +469,16 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>   			desc = 0;
>>>   		}
>>>   
>>> -		elsp_write(desc, engine->execlists.elsp);
>>> +		if (INTEL_GEN(dev_priv) >= 11)
>>> +			elsqc_write(desc, engine->execlists.els, n);
>>> +		else
>>> +			elsp_write(desc, engine->execlists.els);
>>>   	}
>>> +
>>> +	/* for gen11+ we need to manually load the submit queue */
>>> +	if (INTEL_GEN(dev_priv) >= 11)
>>> +		I915_WRITE_FW(RING_EXECLIST_CONTROL(engine), EL_CTRL_LOAD);
>>> +
>>>   	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
>>>   }
>>>   
>>> @@ -506,9 +527,9 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>>>   
>>>   	GEM_TRACE("%s\n", engine->name);
>>>   	for (n = execlists_num_ports(&engine->execlists); --n; )
>>> -		elsp_write(0, engine->execlists.elsp);
>>> +		elsp_write(0, engine->execlists.els);
>>>   
>>> -	elsp_write(ce->lrc_desc, engine->execlists.elsp);
>>> +	elsp_write(ce->lrc_desc, engine->execlists.els);
>>>   	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
>>>   }
>>>   
>>> @@ -2016,8 +2037,12 @@ static int logical_ring_init(struct intel_engine_cs *engine)
>>>   	if (ret)
>>>   		goto error;
>>>   
>>> -	engine->execlists.elsp =
>>> -		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
>>> +	if (INTEL_GEN(engine->i915) >= 11)
>>> +		engine->execlists.els = engine->i915->regs +
>>> +			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
>>> +	else
>>> +		engine->execlists.els = engine->i915->regs +
>>> +			i915_mmio_reg_offset(RING_ELSP(engine));
>>>   
>>>   	return 0;
>>>   
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
>>> index 6d4f9b9..3ab4266 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.h
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.h
>>> @@ -38,6 +38,9 @@
>>>   #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	(1 << 0)
>>>   #define   CTX_CTRL_RS_CTX_ENABLE                (1 << 1)
>>>   #define RING_CONTEXT_STATUS_BUF_BASE(engine)	_MMIO((engine)->mmio_base + 0x370)
>>> +#define RING_EXECLIST_SQ_CONTENTS(engine)	_MMIO((engine)->mmio_base + 0x510)
>>> +#define RING_EXECLIST_CONTROL(engine)		_MMIO((engine)->mmio_base + 0x550)
>>> +#define	  EL_CTRL_LOAD				(1 << 0)
>>>   #define RING_CONTEXT_STATUS_BUF_LO(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8)
>>>   #define RING_CONTEXT_STATUS_BUF_HI(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8 + 4)
>>>   #define RING_CONTEXT_STATUS_PTR(engine)		_MMIO((engine)->mmio_base + 0x3a0)
>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> index c5ff203..d36bb73 100644
>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> @@ -200,9 +200,11 @@ struct intel_engine_execlists {
>>>   	bool no_priolist;
>>>   
>>>   	/**
>>> -	 * @elsp: the ExecList Submission Port register
>>> +	 * @els: gen-specific execlist submission register
>>> +	 * set to the ExecList Submission Port (elsp) register pre-Gen11 and to
>>> +	 * the ExecList Submission Queue Contents register array for Gen11+
>>>   	 */
>>> -	u32 __iomem *elsp;
>>> +	u32 __iomem *els;
>>>   
>>>   	/**
>>>   	 * @port: execlist port states
>>> -- 
>>> 1.9.1
>>>
>>> _______________________________________________
>>> Intel-gfx mailing list
>>> Intel-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Jan. 22, 2018, 3:13 p.m. UTC | #4
Quoting Mika Kuoppala (2018-01-22 15:08:16)
> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
> 
> > On 19/01/18 05:05, Mika Kuoppala wrote:
> >> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
> >> 
> >>> From: Thomas Daniel <thomas.daniel@intel.com>
> >>>
> >>> Enhanced Execlists is an upgraded version of execlists which supports
> >>> up to 8 ports. The lrcs to be submitted are written to a submit queue,
> >>> which is then loaded on the HW. When writing to the ELSP register, the
> >>> lrcs are written cyclically in the queue from position 0 to position 7.
> >>> Alternatively, it is possible to write directly in the individual
> >>> positions of the queue using the ELSQ registers. To be able to re-use
> >>> all the existing code we're using the latter method and we're currently
> >>> limiting ourself to only using 2 elements.
> >>>
> >>> The preemption flow is sligthly different with enhanced execlists, so
> >>> this patch turns preemption off temporarily for Gen11+ while we wait for
> >>> the new mechanism to land.
> >>>
> >>> v2: Rebase.
> >>> v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
> >>> v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
> >>> v5: Reword commit, rename regs to be closer to specs, turn off
> >>>      preemption (Daniele), reuse engine->execlists.elsp (Chris)
> >>>
> >>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >>> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> >>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> >>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> >> 
> >> Was going to adopt this patch from Rodrigo but you were faster.
> >> 
> >> I choose to stash the elsq and use it as a gen11 vs rest toggle:
> >> 
> >> Relevant bits:
> >> 
> >> +static inline void write_port(struct intel_engine_execlists * const execlists,
> >> +                             unsigned int n,
> >> +                             u64 desc)
> >> +{
> >> +       if (execlists->elsq)
> >> +               gen11_elsq_write(desc, n, execlists->elsq);
> >> +       else
> >> +               gen8_elsp_write(desc, execlists->elsp);
> >> +}
> >> +
> >> +static inline void submit_ports(struct intel_engine_execlists * const execlists)
> >> +{
> >> +       /* for gen11+ we need to manually load the submit queue */
> >> +       if (execlists->elsq) {
> >> +               struct intel_engine_cs *engine =
> >> +                       container_of(execlists,
> >> +                                    struct intel_engine_cs,
> >> +                                    execlists);
> >> +               struct drm_i915_private *dev_priv = engine->i915;
> >> +
> >> +               I915_WRITE_FW(RING_ELCR(engine), ELCR_LOAD);
> >> +       }
> >> +}
> >> +
> >>
> >
> > I was undecided about hiding the code in sub-functions because of the 
> > pre-emption path. There is no need in gen11 to inject a context to 
> > preempt to idle,

Really? The preempt-to-idle is so that we can sync the bookkeeping with
the pending CS interrupts. The HW doesn't require it currently either,
it's the SW that does. If you have a way to avoid that, that should be
applicable to the current code as well?
-Chris
Daniele Ceraolo Spurio Jan. 22, 2018, 4:09 p.m. UTC | #5
On 22/01/18 07:13, Chris Wilson wrote:
> Quoting Mika Kuoppala (2018-01-22 15:08:16)
>> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
>>
>>> On 19/01/18 05:05, Mika Kuoppala wrote:
>>>> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
>>>>
>>>>> From: Thomas Daniel <thomas.daniel@intel.com>
>>>>>
>>>>> Enhanced Execlists is an upgraded version of execlists which supports
>>>>> up to 8 ports. The lrcs to be submitted are written to a submit queue,
>>>>> which is then loaded on the HW. When writing to the ELSP register, the
>>>>> lrcs are written cyclically in the queue from position 0 to position 7.
>>>>> Alternatively, it is possible to write directly in the individual
>>>>> positions of the queue using the ELSQ registers. To be able to re-use
>>>>> all the existing code we're using the latter method and we're currently
>>>>> limiting ourself to only using 2 elements.
>>>>>
>>>>> The preemption flow is sligthly different with enhanced execlists, so
>>>>> this patch turns preemption off temporarily for Gen11+ while we wait for
>>>>> the new mechanism to land.
>>>>>
>>>>> v2: Rebase.
>>>>> v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
>>>>> v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
>>>>> v5: Reword commit, rename regs to be closer to specs, turn off
>>>>>       preemption (Daniele), reuse engine->execlists.elsp (Chris)
>>>>>
>>>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
>>>>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>>>>
>>>> Was going to adopt this patch from Rodrigo but you were faster.
>>>>
>>>> I choose to stash the elsq and use it as a gen11 vs rest toggle:
>>>>
>>>> Relevant bits:
>>>>
>>>> +static inline void write_port(struct intel_engine_execlists * const execlists,
>>>> +                             unsigned int n,
>>>> +                             u64 desc)
>>>> +{
>>>> +       if (execlists->elsq)
>>>> +               gen11_elsq_write(desc, n, execlists->elsq);
>>>> +       else
>>>> +               gen8_elsp_write(desc, execlists->elsp);
>>>> +}
>>>> +
>>>> +static inline void submit_ports(struct intel_engine_execlists * const execlists)
>>>> +{
>>>> +       /* for gen11+ we need to manually load the submit queue */
>>>> +       if (execlists->elsq) {
>>>> +               struct intel_engine_cs *engine =
>>>> +                       container_of(execlists,
>>>> +                                    struct intel_engine_cs,
>>>> +                                    execlists);
>>>> +               struct drm_i915_private *dev_priv = engine->i915;
>>>> +
>>>> +               I915_WRITE_FW(RING_ELCR(engine), ELCR_LOAD);
>>>> +       }
>>>> +}
>>>> +
>>>>
>>>
>>> I was undecided about hiding the code in sub-functions because of the
>>> pre-emption path. There is no need in gen11 to inject a context to
>>> preempt to idle,
> 
> Really? The preempt-to-idle is so that we can sync the bookkeeping with
> the pending CS interrupts. The HW doesn't require it currently either,
> it's the SW that does. If you have a way to avoid that, that should be
> applicable to the current code as well?
> -Chris
> 

We can't avoid preempt-to-idle, we can do it in a simpler way. There is 
a bit in RING_EXECLIST_CONTROL that triggers a preemp-to-idle, without 
the need to do a ctx injection. We'll need to move preemption completion 
detection to the CSB value instead of the HWSP write, not sure about the 
impact of that on our bookkeeping.

Daniele
Chris Wilson Jan. 22, 2018, 5:32 p.m. UTC | #6
Quoting Daniele Ceraolo Spurio (2018-01-22 16:09:28)
> 
> 
> On 22/01/18 07:13, Chris Wilson wrote:
> > Quoting Mika Kuoppala (2018-01-22 15:08:16)
> >> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
> >>
> >>> On 19/01/18 05:05, Mika Kuoppala wrote:
> >>>> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
> >>>>
> >>>>> From: Thomas Daniel <thomas.daniel@intel.com>
> >>>>>
> >>>>> Enhanced Execlists is an upgraded version of execlists which supports
> >>>>> up to 8 ports. The lrcs to be submitted are written to a submit queue,
> >>>>> which is then loaded on the HW. When writing to the ELSP register, the
> >>>>> lrcs are written cyclically in the queue from position 0 to position 7.
> >>>>> Alternatively, it is possible to write directly in the individual
> >>>>> positions of the queue using the ELSQ registers. To be able to re-use
> >>>>> all the existing code we're using the latter method and we're currently
> >>>>> limiting ourself to only using 2 elements.
> >>>>>
> >>>>> The preemption flow is sligthly different with enhanced execlists, so
> >>>>> this patch turns preemption off temporarily for Gen11+ while we wait for
> >>>>> the new mechanism to land.
> >>>>>
> >>>>> v2: Rebase.
> >>>>> v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
> >>>>> v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
> >>>>> v5: Reword commit, rename regs to be closer to specs, turn off
> >>>>>       preemption (Daniele), reuse engine->execlists.elsp (Chris)
> >>>>>
> >>>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> >>>>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> >>>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> >>>>
> >>>> Was going to adopt this patch from Rodrigo but you were faster.
> >>>>
> >>>> I choose to stash the elsq and use it as a gen11 vs rest toggle:
> >>>>
> >>>> Relevant bits:
> >>>>
> >>>> +static inline void write_port(struct intel_engine_execlists * const execlists,
> >>>> +                             unsigned int n,
> >>>> +                             u64 desc)
> >>>> +{
> >>>> +       if (execlists->elsq)
> >>>> +               gen11_elsq_write(desc, n, execlists->elsq);
> >>>> +       else
> >>>> +               gen8_elsp_write(desc, execlists->elsp);
> >>>> +}
> >>>> +
> >>>> +static inline void submit_ports(struct intel_engine_execlists * const execlists)
> >>>> +{
> >>>> +       /* for gen11+ we need to manually load the submit queue */
> >>>> +       if (execlists->elsq) {
> >>>> +               struct intel_engine_cs *engine =
> >>>> +                       container_of(execlists,
> >>>> +                                    struct intel_engine_cs,
> >>>> +                                    execlists);
> >>>> +               struct drm_i915_private *dev_priv = engine->i915;
> >>>> +
> >>>> +               I915_WRITE_FW(RING_ELCR(engine), ELCR_LOAD);
> >>>> +       }
> >>>> +}
> >>>> +
> >>>>
> >>>
> >>> I was undecided about hiding the code in sub-functions because of the
> >>> pre-emption path. There is no need in gen11 to inject a context to
> >>> preempt to idle,
> > 
> > Really? The preempt-to-idle is so that we can sync the bookkeeping with
> > the pending CS interrupts. The HW doesn't require it currently either,
> > it's the SW that does. If you have a way to avoid that, that should be
> > applicable to the current code as well?
> > -Chris
> > 
> 
> We can't avoid preempt-to-idle, we can do it in a simpler way. There is 
> a bit in RING_EXECLIST_CONTROL that triggers a preemp-to-idle, without 
> the need to do a ctx injection. We'll need to move preemption completion 
> detection to the CSB value instead of the HWSP write, not sure about the 
> impact of that on our bookkeeping.

Ah, shucks :( I was hoping there's a simple way to avoid idling.

I have to ask, is it worth it? If we still have to do a CS interrupt
round trip, what's the difference? Hmm. I wonder if assume that the
preemption is nearly instantaneous (say 10us), we could short-circuit
the interrupt and poll. Falling back to interrupt if longer, and/or
resetting the GPU to guarantee latencies.
-Chris
Daniele Ceraolo Spurio Jan. 22, 2018, 9:38 p.m. UTC | #7
On 22/01/18 09:32, Chris Wilson wrote:
> Quoting Daniele Ceraolo Spurio (2018-01-22 16:09:28)
>>
>>
>> On 22/01/18 07:13, Chris Wilson wrote:
>>> Quoting Mika Kuoppala (2018-01-22 15:08:16)
>>>> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
>>>>
>>>>> On 19/01/18 05:05, Mika Kuoppala wrote:
>>>>>> Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> writes:
>>>>>>
>>>>>>> From: Thomas Daniel <thomas.daniel@intel.com>
>>>>>>>
>>>>>>> Enhanced Execlists is an upgraded version of execlists which supports
>>>>>>> up to 8 ports. The lrcs to be submitted are written to a submit queue,
>>>>>>> which is then loaded on the HW. When writing to the ELSP register, the
>>>>>>> lrcs are written cyclically in the queue from position 0 to position 7.
>>>>>>> Alternatively, it is possible to write directly in the individual
>>>>>>> positions of the queue using the ELSQ registers. To be able to re-use
>>>>>>> all the existing code we're using the latter method and we're currently
>>>>>>> limiting ourself to only using 2 elements.
>>>>>>>
>>>>>>> The preemption flow is sligthly different with enhanced execlists, so
>>>>>>> this patch turns preemption off temporarily for Gen11+ while we wait for
>>>>>>> the new mechanism to land.
>>>>>>>
>>>>>>> v2: Rebase.
>>>>>>> v3: Switch from !IS_GEN11 to GEN < 11 (Daniele Ceraolo Spurio).
>>>>>>> v4: Use the elsq registers instead of elsp. (Daniele Ceraolo Spurio)
>>>>>>> v5: Reword commit, rename regs to be closer to specs, turn off
>>>>>>>        preemption (Daniele), reuse engine->execlists.elsp (Chris)
>>>>>>>
>>>>>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
>>>>>>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>>>>>>> Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>>>>>>
>>>>>> Was going to adopt this patch from Rodrigo but you were faster.
>>>>>>
>>>>>> I choose to stash the elsq and use it as a gen11 vs rest toggle:
>>>>>>
>>>>>> Relevant bits:
>>>>>>
>>>>>> +static inline void write_port(struct intel_engine_execlists * const execlists,
>>>>>> +                             unsigned int n,
>>>>>> +                             u64 desc)
>>>>>> +{
>>>>>> +       if (execlists->elsq)
>>>>>> +               gen11_elsq_write(desc, n, execlists->elsq);
>>>>>> +       else
>>>>>> +               gen8_elsp_write(desc, execlists->elsp);
>>>>>> +}
>>>>>> +
>>>>>> +static inline void submit_ports(struct intel_engine_execlists * const execlists)
>>>>>> +{
>>>>>> +       /* for gen11+ we need to manually load the submit queue */
>>>>>> +       if (execlists->elsq) {
>>>>>> +               struct intel_engine_cs *engine =
>>>>>> +                       container_of(execlists,
>>>>>> +                                    struct intel_engine_cs,
>>>>>> +                                    execlists);
>>>>>> +               struct drm_i915_private *dev_priv = engine->i915;
>>>>>> +
>>>>>> +               I915_WRITE_FW(RING_ELCR(engine), ELCR_LOAD);
>>>>>> +       }
>>>>>> +}
>>>>>> +
>>>>>>
>>>>>
>>>>> I was undecided about hiding the code in sub-functions because of the
>>>>> pre-emption path. There is no need in gen11 to inject a context to
>>>>> preempt to idle,
>>>
>>> Really? The preempt-to-idle is so that we can sync the bookkeeping with
>>> the pending CS interrupts. The HW doesn't require it currently either,
>>> it's the SW that does. If you have a way to avoid that, that should be
>>> applicable to the current code as well?
>>> -Chris
>>>
>>
>> We can't avoid preempt-to-idle, we can do it in a simpler way. There is
>> a bit in RING_EXECLIST_CONTROL that triggers a preemp-to-idle, without
>> the need to do a ctx injection. We'll need to move preemption completion
>> detection to the CSB value instead of the HWSP write, not sure about the
>> impact of that on our bookkeeping.
> 
> Ah, shucks :( I was hoping there's a simple way to avoid idling.
> 
> I have to ask, is it worth it? If we still have to do a CS interrupt
> round trip, what's the difference? Hmm. I wonder if assume that the
> preemption is nearly instantaneous (say 10us), we could short-circuit
> the interrupt and poll. Falling back to interrupt if longer, and/or
> resetting the GPU to guarantee latencies.
> -Chris
> 

The SW cost shouldn't be too bad, so having 1 less ctx switch should 
give us some some benefits. Trying a poll with a fall-back sounds nice, 
but we'll probably have to wait until we have timing info to see if the 
polling makes sense at all.

Daniele

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c42015b..3163543 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2738,8 +2738,11 @@  static inline unsigned int i915_sg_segment_size(void)
 
 #define HAS_LOGICAL_RING_CONTEXTS(dev_priv) \
 		((dev_priv)->info.has_logical_ring_contexts)
+
+/* XXX: Preemption disabled for Gen11+ until support for new flow lands */
 #define HAS_LOGICAL_RING_PREEMPTION(dev_priv) \
-		((dev_priv)->info.has_logical_ring_preemption)
+		((dev_priv)->info.has_logical_ring_preemption && \
+		 INTEL_GEN(dev_priv) < 11)
 
 #define HAS_EXECLISTS(dev_priv) HAS_LOGICAL_RING_CONTEXTS(dev_priv)
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ff25f20..67ad7c9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -428,11 +428,24 @@  static inline void elsp_write(u64 desc, u32 __iomem *elsp)
 	writel(lower_32_bits(desc), elsp);
 }
 
+static inline void elsqc_write(u64 desc, u32 __iomem *elsqc, u32 port)
+{
+	writel(lower_32_bits(desc), elsqc + port * 2);
+	writel(upper_32_bits(desc), elsqc + port * 2 + 1);
+}
+
 static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
+	struct drm_i915_private *dev_priv = engine->i915;
 	struct execlist_port *port = engine->execlists.port;
 	unsigned int n;
 
+	/*
+	 * Gen11+ note: the submit queue is not cleared after being submitted
+	 * to the HW so we need to make sure we always clean it up. This is
+	 * currently ensured by the fact that we always write the same number
+	 * of elsq entries, keep this in mind before changing the loop below.
+	 */
 	for (n = execlists_num_ports(&engine->execlists); n--; ) {
 		struct drm_i915_gem_request *rq;
 		unsigned int count;
@@ -456,8 +469,16 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 			desc = 0;
 		}
 
-		elsp_write(desc, engine->execlists.elsp);
+		if (INTEL_GEN(dev_priv) >= 11)
+			elsqc_write(desc, engine->execlists.els, n);
+		else
+			elsp_write(desc, engine->execlists.els);
 	}
+
+	/* for gen11+ we need to manually load the submit queue */
+	if (INTEL_GEN(dev_priv) >= 11)
+		I915_WRITE_FW(RING_EXECLIST_CONTROL(engine), EL_CTRL_LOAD);
+
 	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
 }
 
@@ -506,9 +527,9 @@  static void inject_preempt_context(struct intel_engine_cs *engine)
 
 	GEM_TRACE("%s\n", engine->name);
 	for (n = execlists_num_ports(&engine->execlists); --n; )
-		elsp_write(0, engine->execlists.elsp);
+		elsp_write(0, engine->execlists.els);
 
-	elsp_write(ce->lrc_desc, engine->execlists.elsp);
+	elsp_write(ce->lrc_desc, engine->execlists.els);
 	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
 }
 
@@ -2016,8 +2037,12 @@  static int logical_ring_init(struct intel_engine_cs *engine)
 	if (ret)
 		goto error;
 
-	engine->execlists.elsp =
-		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	if (INTEL_GEN(engine->i915) >= 11)
+		engine->execlists.els = engine->i915->regs +
+			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
+	else
+		engine->execlists.els = engine->i915->regs +
+			i915_mmio_reg_offset(RING_ELSP(engine));
 
 	return 0;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 6d4f9b9..3ab4266 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -38,6 +38,9 @@ 
 #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	(1 << 0)
 #define   CTX_CTRL_RS_CTX_ENABLE                (1 << 1)
 #define RING_CONTEXT_STATUS_BUF_BASE(engine)	_MMIO((engine)->mmio_base + 0x370)
+#define RING_EXECLIST_SQ_CONTENTS(engine)	_MMIO((engine)->mmio_base + 0x510)
+#define RING_EXECLIST_CONTROL(engine)		_MMIO((engine)->mmio_base + 0x550)
+#define	  EL_CTRL_LOAD				(1 << 0)
 #define RING_CONTEXT_STATUS_BUF_LO(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8)
 #define RING_CONTEXT_STATUS_BUF_HI(engine, i)	_MMIO((engine)->mmio_base + 0x370 + (i) * 8 + 4)
 #define RING_CONTEXT_STATUS_PTR(engine)		_MMIO((engine)->mmio_base + 0x3a0)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c5ff203..d36bb73 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -200,9 +200,11 @@  struct intel_engine_execlists {
 	bool no_priolist;
 
 	/**
-	 * @elsp: the ExecList Submission Port register
+	 * @els: gen-specific execlist submission register
+	 * set to the ExecList Submission Port (elsp) register pre-Gen11 and to
+	 * the ExecList Submission Queue Contents register array for Gen11+
 	 */
-	u32 __iomem *elsp;
+	u32 __iomem *els;
 
 	/**
 	 * @port: execlist port states