drm/i915/execlists: Cache ELSP register offset
diff mbox

Message ID 20171207204503.8159-1-chris@chris-wilson.co.uk
State New
Headers show

Commit Message

Chris Wilson Dec. 7, 2017, 8:45 p.m. UTC
Currently on every submission, we recalculate the ELSP register offset
for the engine, after chasing the pointers to find the iomem base. Since
this is fixed for the lifetime of the driver record the offset in the
execlists struct.

In practice the difference is negligible, it just happens to remove 27
bytes of eyesore pointer dancing from next to the hottest instruction
(which is itself due to stalling for a cache miss) in perf profiles of
the execlists_submission_tasklet().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c        | 12 ++++++------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  5 +++++
 2 files changed, 11 insertions(+), 6 deletions(-)

Comments

michel.thierry@intel.com Dec. 7, 2017, 10:19 p.m. UTC | #1
On 07/12/17 12:45, Chris Wilson wrote:
> Currently on every submission, we recalculate the ELSP register offset
> for the engine, after chasing the pointers to find the iomem base. Since
> this is fixed for the lifetime of the driver record the offset in the
> execlists struct.
> 
> In practice the difference is negligible, it just happens to remove 27
> bytes of eyesore pointer dancing from next to the hottest instruction
> (which is itself due to stalling for a cache miss) in perf profiles of
> the execlists_submission_tasklet().
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/intel_lrc.c        | 12 ++++++------
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  5 +++++
>   2 files changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 2a8160f603ab..93b5ce6307af 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -431,8 +431,7 @@ static inline void elsp_write(u64 desc, u32 __iomem *elsp)
>   static void execlists_submit_ports(struct intel_engine_cs *engine)
>   {
>          struct execlist_port *port = engine->execlists.port;
> -       u32 __iomem *elsp =
> -               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +       u32 __iomem *elsp = engine->execlists.elsp;
>          unsigned int n;
> 
>          for (n = execlists_num_ports(&engine->execlists); n--; ) {

Since you're moving this, probably it can now use engine->execlists.elsp 
directly (as inject_preempt_context does), i.e.:

---
@@ -431,7 +431,6 @@ static inline void elsp_write(u64 desc, u32 __iomem 
*elsp)
  static void execlists_submit_ports(struct intel_engine_cs *engine)
  {
         struct execlist_port *port = engine->execlists.port;
-       u32 __iomem *elsp = engine->execlists.elsp;
         unsigned int n;

         for (n = execlists_num_ports(&engine->execlists); n--; ) {
@@ -457,7 +456,7 @@ static void execlists_submit_ports(struct 
intel_engine_cs *engine)
                         desc = 0;
                 }

-               elsp_write(desc, elsp);
+               elsp_write(desc, engine->execlists.elsp);
         }
         execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
  }
---

Anyway,

Reviewed-by: Michel Thierry <michel.thierry@intel.com>

> @@ -496,8 +495,6 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
>   {
>          struct intel_context *ce =
>                  &engine->i915->preempt_context->engine[engine->id];
> -       u32 __iomem *elsp =
> -               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
>          unsigned int n;
> 
>          GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID);
> @@ -510,9 +507,9 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
> 
>          GEM_TRACE("\n");
>          for (n = execlists_num_ports(&engine->execlists); --n; )
> -               elsp_write(0, elsp);
> +               elsp_write(0, engine->execlists.elsp);
> 
> -       elsp_write(ce->lrc_desc, elsp);
> +       elsp_write(ce->lrc_desc, engine->execlists.elsp);
>          execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
>   }
> 
> @@ -1509,6 +1506,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
>          execlists->csb_head = -1;
>          execlists->active = 0;
> 
> +       execlists->elsp =
> +               dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +
>          /* After a GPU reset, we may have requests to replay */
>          if (execlists->first)
>                  tasklet_schedule(&execlists->tasklet);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index c68ab3ead83c..183165b9b3fb 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -199,6 +199,11 @@ struct intel_engine_execlists {
>           */
>          bool no_priolist;
> 
> +       /**
> +        * @elsp: the ExecList Submission Port register
> +        */
> +       u32 __iomem *elsp;
> +
>          /**
>           * @port: execlist port states
>           *
> --
> 2.15.1
Chris Wilson Dec. 7, 2017, 10:23 p.m. UTC | #2
Quoting Michel Thierry (2017-12-07 22:19:05)
> On 07/12/17 12:45, Chris Wilson wrote:
> > Currently on every submission, we recalculate the ELSP register offset
> > for the engine, after chasing the pointers to find the iomem base. Since
> > this is fixed for the lifetime of the driver record the offset in the
> > execlists struct.
> > 
> > In practice the difference is negligible, it just happens to remove 27
> > bytes of eyesore pointer dancing from next to the hottest instruction
> > (which is itself due to stalling for a cache miss) in perf profiles of
> > the execlists_submission_tasklet().
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >   drivers/gpu/drm/i915/intel_lrc.c        | 12 ++++++------
> >   drivers/gpu/drm/i915/intel_ringbuffer.h |  5 +++++
> >   2 files changed, 11 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 2a8160f603ab..93b5ce6307af 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -431,8 +431,7 @@ static inline void elsp_write(u64 desc, u32 __iomem *elsp)
> >   static void execlists_submit_ports(struct intel_engine_cs *engine)
> >   {
> >          struct execlist_port *port = engine->execlists.port;
> > -       u32 __iomem *elsp =
> > -               engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> > +       u32 __iomem *elsp = engine->execlists.elsp;
> >          unsigned int n;
> > 
> >          for (n = execlists_num_ports(&engine->execlists); n--; ) {
> 
> Since you're moving this, probably it can now use engine->execlists.elsp 
> directly (as inject_preempt_context does), i.e.:

In fact, gcc prefers it. Odd that it generates different code at all, I
wonder what it is actually doing...
-Chris

Patch
diff mbox

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2a8160f603ab..93b5ce6307af 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -431,8 +431,7 @@  static inline void elsp_write(u64 desc, u32 __iomem *elsp)
 static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
 	struct execlist_port *port = engine->execlists.port;
-	u32 __iomem *elsp =
-		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u32 __iomem *elsp = engine->execlists.elsp;
 	unsigned int n;
 
 	for (n = execlists_num_ports(&engine->execlists); n--; ) {
@@ -496,8 +495,6 @@  static void inject_preempt_context(struct intel_engine_cs *engine)
 {
 	struct intel_context *ce =
 		&engine->i915->preempt_context->engine[engine->id];
-	u32 __iomem *elsp =
-		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
 	unsigned int n;
 
 	GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID);
@@ -510,9 +507,9 @@  static void inject_preempt_context(struct intel_engine_cs *engine)
 
 	GEM_TRACE("\n");
 	for (n = execlists_num_ports(&engine->execlists); --n; )
-		elsp_write(0, elsp);
+		elsp_write(0, engine->execlists.elsp);
 
-	elsp_write(ce->lrc_desc, elsp);
+	elsp_write(ce->lrc_desc, engine->execlists.elsp);
 	execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
 }
 
@@ -1509,6 +1506,9 @@  static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	execlists->csb_head = -1;
 	execlists->active = 0;
 
+	execlists->elsp =
+		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+
 	/* After a GPU reset, we may have requests to replay */
 	if (execlists->first)
 		tasklet_schedule(&execlists->tasklet);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c68ab3ead83c..183165b9b3fb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -199,6 +199,11 @@  struct intel_engine_execlists {
 	 */
 	bool no_priolist;
 
+	/**
+	 * @elsp: the ExecList Submission Port register
+	 */
+	u32 __iomem *elsp;
+
 	/**
 	 * @port: execlist port states
 	 *