diff mbox

[1/2] drm/i915: Cache some registers in execlists hot paths

Message ID 1458746103-25849-1-git-send-email-tvrtko.ursulin@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tvrtko Ursulin March 23, 2016, 3:15 p.m. UTC
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

GCC cannot optimize well calculations hidden in macros and
assigned to temporary structures. We can cache the register in
ELSP write, and refactor reading of the CSB a bit to enable
it to do a better job.

Code is still equally readable but the generated body of the
CSB read loop is 30% smaller, and since that loop runs at
least once per interrupt, which in turn can fire in tens or
hundreds thousands times per second, must be of some value.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
 drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
 2 files changed, 21 insertions(+), 16 deletions(-)

Comments

Chris Wilson March 24, 2016, 11:16 a.m. UTC | #1
On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> GCC cannot optimize well calculations hidden in macros and
> assigned to temporary structures. We can cache the register in
> ELSP write, and refactor reading of the CSB a bit to enable
> it to do a better job.
> 
> Code is still equally readable but the generated body of the
> CSB read loop is 30% smaller, and since that loop runs at
> least once per interrupt, which in turn can fire in tens or
> hundreds thousands times per second, must be of some value.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
>  drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
>  2 files changed, 21 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 3a23b9549f7b..67592f8354d6 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>  {
>  
>  	struct intel_engine_cs *engine = rq0->engine;
> -	struct drm_device *dev = engine->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_i915_private *dev_priv = rq0->i915;
> +	i915_reg_t elsp_reg = RING_ELSP(engine);

If we are going to open-code it, how about going whole hog
and

/* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the
 * trouble of doing so - gcc fails miserably!
 */
u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
then use writel(upper_32_bits(desc[1]), elsp);

Keeping the comment around for grepping.

>  	uint64_t desc[2];
>  
>  	if (rq1) {
> @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>  	rq0->elsp_submitted++;
>  
>  	/* You must always write both descriptors in the order below. */
> -	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
> -	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> +	I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
> +	I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
>  
> -	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
> +	I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
>  	/* The context is automatically loaded after the following */
> -	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
> +	I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
>  
>  	/* ELSP is a wo register, use another nearby reg for posting */
>  	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));

Observing the above, we can also kill the POSTING_READ_FW() which will
make a much bigger improvement than all of the above.

> @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
>  }
>  
>  static u32
> -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
> -		   u32 *context_id)
> +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
> +		   unsigned int read_pointer, u32 *context_id)
>  {
> -	struct drm_i915_private *dev_priv = engine->dev->dev_private;
>  	u32 status;
>  
>  	read_pointer %= GEN8_CSB_ENTRIES;
>  
> -	status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
> +	status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));

If we forgo the "convenience" interface here, could we not also improve
the readability of the code by just having the csb ringbuffer and readl?
-Chris
Tvrtko Ursulin March 24, 2016, 11:31 a.m. UTC | #2
On 24/03/16 11:16, Chris Wilson wrote:
> On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> GCC cannot optimize well calculations hidden in macros and
>> assigned to temporary structures. We can cache the register in
>> ELSP write, and refactor reading of the CSB a bit to enable
>> it to do a better job.
>>
>> Code is still equally readable but the generated body of the
>> CSB read loop is 30% smaller, and since that loop runs at
>> least once per interrupt, which in turn can fire in tens or
>> hundreds thousands times per second, must be of some value.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++-------------
>>   drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++---
>>   2 files changed, 21 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 3a23b9549f7b..67592f8354d6 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>>   {
>>
>>   	struct intel_engine_cs *engine = rq0->engine;
>> -	struct drm_device *dev = engine->dev;
>> -	struct drm_i915_private *dev_priv = dev->dev_private;
>> +	struct drm_i915_private *dev_priv = rq0->i915;
>> +	i915_reg_t elsp_reg = RING_ELSP(engine);
>
> If we are going to open-code it, how about going whole hog
> and
>
> /* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the
>   * trouble of doing so - gcc fails miserably!
>   */
> u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> then use writel(upper_32_bits(desc[1]), elsp);
>
> Keeping the comment around for grepping.

Yeah I had that but thought it will be considered to tasteless for 
posting. :)

>>   	uint64_t desc[2];
>>
>>   	if (rq1) {
>> @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>>   	rq0->elsp_submitted++;
>>
>>   	/* You must always write both descriptors in the order below. */
>> -	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
>> -	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
>> +	I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
>> +	I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
>>
>> -	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
>> +	I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
>>   	/* The context is automatically loaded after the following */
>> -	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
>> +	I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
>>
>>   	/* ELSP is a wo register, use another nearby reg for posting */
>>   	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
>
> Observing the above, we can also kill the POSTING_READ_FW() which will
> make a much bigger improvement than all of the above.

It is a different register, why it can be removed?

>> @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
>>   }
>>
>>   static u32
>> -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
>> -		   u32 *context_id)
>> +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
>> +		   unsigned int read_pointer, u32 *context_id)
>>   {
>> -	struct drm_i915_private *dev_priv = engine->dev->dev_private;
>>   	u32 status;
>>
>>   	read_pointer %= GEN8_CSB_ENTRIES;
>>
>> -	status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
>> +	status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
>
> If we forgo the "convenience" interface here, could we not also improve
> the readability of the code by just having the csb ringbuffer and readl?

The same whole-hog open coding you mean like the above? It is slightly 
more efficient, not sure that is is more readable so maybe I am not 
getting exactly what you mean.

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 3a23b9549f7b..67592f8354d6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -361,8 +361,8 @@  static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 {
 
 	struct intel_engine_cs *engine = rq0->engine;
-	struct drm_device *dev = engine->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_private *dev_priv = rq0->i915;
+	i915_reg_t elsp_reg = RING_ELSP(engine);
 	uint64_t desc[2];
 
 	if (rq1) {
@@ -376,12 +376,12 @@  static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	rq0->elsp_submitted++;
 
 	/* You must always write both descriptors in the order below. */
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+	I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1]));
+	I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1]));
 
-	I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
+	I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0]));
 	/* The context is automatically loaded after the following */
-	I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
+	I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0]));
 
 	/* ELSP is a wo register, use another nearby reg for posting */
 	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
@@ -517,21 +517,19 @@  execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
 }
 
 static u32
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
-		   u32 *context_id)
+get_context_status(struct drm_i915_private *dev_priv, u32 csb_base,
+		   unsigned int read_pointer, u32 *context_id)
 {
-	struct drm_i915_private *dev_priv = engine->dev->dev_private;
 	u32 status;
 
 	read_pointer %= GEN8_CSB_ENTRIES;
 
-	status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
+	status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer));
 
 	if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
 		return 0;
 
-	*context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
-							      read_pointer));
+	*context_id = I915_READ_FW(RING_CSB_HI(csb_base, read_pointer));
 
 	return status;
 }
@@ -548,6 +546,7 @@  void intel_lrc_irq_handler(struct intel_engine_cs *engine)
 	struct drm_i915_private *dev_priv = engine->dev->dev_private;
 	u32 status_pointer;
 	unsigned int read_pointer, write_pointer;
+	u32 csb_base = RING_CSB_BASE(engine);
 	u32 csb[GEN8_CSB_ENTRIES][2];
 	unsigned int csb_read = 0, i;
 	unsigned int submit_contexts = 0;
@@ -565,7 +564,8 @@  void intel_lrc_irq_handler(struct intel_engine_cs *engine)
 	while (read_pointer < write_pointer) {
 		if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
 			break;
-		csb[csb_read][0] = get_context_status(engine, ++read_pointer,
+		csb[csb_read][0] = get_context_status(dev_priv, csb_base,
+						      ++read_pointer,
 						      &csb[csb_read][1]);
 		csb_read++;
 	}
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index a17cb12221ba..6690d93d603f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -34,9 +34,14 @@ 
 #define	  CTX_CTRL_INHIBIT_SYN_CTX_SWITCH	(1 << 3)
 #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	(1 << 0)
 #define   CTX_CTRL_RS_CTX_ENABLE                (1 << 1)
-#define RING_CONTEXT_STATUS_BUF_LO(ring, i)	_MMIO((ring)->mmio_base + 0x370 + (i) * 8)
-#define RING_CONTEXT_STATUS_BUF_HI(ring, i)	_MMIO((ring)->mmio_base + 0x370 + (i) * 8 + 4)
-#define RING_CONTEXT_STATUS_PTR(ring)		_MMIO((ring)->mmio_base + 0x3a0)
+
+#define RING_CSB_BASE(ring)		((ring)->mmio_base + 0x370)
+#define RING_CSB_LO(csb_base, i)	_MMIO((csb_base) + (i) * 8)
+#define RING_CSB_HI(csb_base, i)	_MMIO((csb_base) + (i) * 8 + 4)
+
+#define RING_CONTEXT_STATUS_BUF_LO(ring, i)  RING_CSB_LO(RING_CSB_BASE(ring), i)
+#define RING_CONTEXT_STATUS_BUF_HI(ring, i)  RING_CSB_HI(RING_CSB_BASE(ring), i)
+#define RING_CONTEXT_STATUS_PTR(ring)	     _MMIO((ring)->mmio_base + 0x3a0)
 
 /* The docs specify that the write pointer wraps around after 5h, "After status
  * is written out to the last available status QW at offset 5h, this pointer