Message ID | 1458746103-25849-1-git-send-email-tvrtko.ursulin@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote: > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > > GCC cannot optimize well calculations hidden in macros and > assigned to temporary structures. We can cache the register in > ELSP write, and refactor reading of the CSB a bit to enable > it to do a better job. > > Code is still equally readable but the generated body of the > CSB read loop is 30% smaller, and since that loop runs at > least once per interrupt, which in turn can fire in tens or > hundreds thousands times per second, must be of some value. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > --- > drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++------------- > drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++--- > 2 files changed, 21 insertions(+), 16 deletions(-) > > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 3a23b9549f7b..67592f8354d6 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, > { > > struct intel_engine_cs *engine = rq0->engine; > - struct drm_device *dev = engine->dev; > - struct drm_i915_private *dev_priv = dev->dev_private; > + struct drm_i915_private *dev_priv = rq0->i915; > + i915_reg_t elsp_reg = RING_ELSP(engine); If we are going to open-code it, how about going whole hog and /* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the * trouble of doing so - gcc fails miserably! */ u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine)); then use writel(upper_32_bits(desc[1]), elsp); Keeping the comment around for grepping. > uint64_t desc[2]; > > if (rq1) { > @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, > rq0->elsp_submitted++; > > /* You must always write both descriptors in the order below. */ > - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1])); > - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1])); > + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1])); > + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1])); > > - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0])); > + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0])); > /* The context is automatically loaded after the following */ > - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0])); > + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0])); > > /* ELSP is a wo register, use another nearby reg for posting */ > POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine)); Observing the above, we can also kill the POSTING_READ_FW() which will make a much bigger improvement than all of the above. > @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id) > } > > static u32 > -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer, > - u32 *context_id) > +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base, > + unsigned int read_pointer, u32 *context_id) > { > - struct drm_i915_private *dev_priv = engine->dev->dev_private; > u32 status; > > read_pointer %= GEN8_CSB_ENTRIES; > > - status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer)); > + status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer)); If we forgo the "convenience" interface here, could we not also improve the readability of the code by just having the csb ringbuffer and readl? -Chris
On 24/03/16 11:16, Chris Wilson wrote: > On Wed, Mar 23, 2016 at 03:15:02PM +0000, Tvrtko Ursulin wrote: >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> >> GCC cannot optimize well calculations hidden in macros and >> assigned to temporary structures. We can cache the register in >> ELSP write, and refactor reading of the CSB a bit to enable >> it to do a better job. >> >> Code is still equally readable but the generated body of the >> CSB read loop is 30% smaller, and since that loop runs at >> least once per interrupt, which in turn can fire in tens or >> hundreds thousands times per second, must be of some value. >> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >> --- >> drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++------------- >> drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++--- >> 2 files changed, 21 insertions(+), 16 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c >> index 3a23b9549f7b..67592f8354d6 100644 >> --- a/drivers/gpu/drm/i915/intel_lrc.c >> +++ b/drivers/gpu/drm/i915/intel_lrc.c >> @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, >> { >> >> struct intel_engine_cs *engine = rq0->engine; >> - struct drm_device *dev = engine->dev; >> - struct drm_i915_private *dev_priv = dev->dev_private; >> + struct drm_i915_private *dev_priv = rq0->i915; >> + i915_reg_t elsp_reg = RING_ELSP(engine); > > If we are going to open-code it, how about going whole hog > and > > /* We opencode I915_WRITE_FW(RING_ELSP(engine)) here to save gcc the > * trouble of doing so - gcc fails miserably! > */ > u32 *elsp = rq->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine)); > then use writel(upper_32_bits(desc[1]), elsp); > > Keeping the comment around for grepping. Yeah I had that but thought it will be considered to tasteless for posting. :) >> uint64_t desc[2]; >> >> if (rq1) { >> @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, >> rq0->elsp_submitted++; >> >> /* You must always write both descriptors in the order below. */ >> - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1])); >> - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1])); >> + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1])); >> + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1])); >> >> - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0])); >> + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0])); >> /* The context is automatically loaded after the following */ >> - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0])); >> + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0])); >> >> /* ELSP is a wo register, use another nearby reg for posting */ >> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine)); > > Observing the above, we can also kill the POSTING_READ_FW() which will > make a much bigger improvement than all of the above. It is a different register, why it can be removed? >> @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id) >> } >> >> static u32 >> -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer, >> - u32 *context_id) >> +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base, >> + unsigned int read_pointer, u32 *context_id) >> { >> - struct drm_i915_private *dev_priv = engine->dev->dev_private; >> u32 status; >> >> read_pointer %= GEN8_CSB_ENTRIES; >> >> - status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer)); >> + status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer)); > > If we forgo the "convenience" interface here, could we not also improve > the readability of the code by just having the csb ringbuffer and readl? The same whole-hog open coding you mean like the above? It is slightly more efficient, not sure that is is more readable so maybe I am not getting exactly what you mean. Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 3a23b9549f7b..67592f8354d6 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, { struct intel_engine_cs *engine = rq0->engine; - struct drm_device *dev = engine->dev; - struct drm_i915_private *dev_priv = dev->dev_private; + struct drm_i915_private *dev_priv = rq0->i915; + i915_reg_t elsp_reg = RING_ELSP(engine); uint64_t desc[2]; if (rq1) { @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, rq0->elsp_submitted++; /* You must always write both descriptors in the order below. */ - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1])); - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1])); + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1])); + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1])); - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0])); + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0])); /* The context is automatically loaded after the following */ - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0])); + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0])); /* ELSP is a wo register, use another nearby reg for posting */ POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine)); @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id) } static u32 -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer, - u32 *context_id) +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base, + unsigned int read_pointer, u32 *context_id) { - struct drm_i915_private *dev_priv = engine->dev->dev_private; u32 status; read_pointer %= GEN8_CSB_ENTRIES; - status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer)); + status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer)); if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) return 0; - *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine, - read_pointer)); + *context_id = I915_READ_FW(RING_CSB_HI(csb_base, read_pointer)); return status; } @@ -548,6 +546,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine) struct drm_i915_private *dev_priv = engine->dev->dev_private; u32 status_pointer; unsigned int read_pointer, write_pointer; + u32 csb_base = RING_CSB_BASE(engine); u32 csb[GEN8_CSB_ENTRIES][2]; unsigned int csb_read = 0, i; unsigned int submit_contexts = 0; @@ -565,7 +564,8 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine) while (read_pointer < write_pointer) { if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES)) break; - csb[csb_read][0] = get_context_status(engine, ++read_pointer, + csb[csb_read][0] = get_context_status(dev_priv, csb_base, + ++read_pointer, &csb[csb_read][1]); csb_read++; } diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index a17cb12221ba..6690d93d603f 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -34,9 +34,14 @@ #define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH (1 << 3) #define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT (1 << 0) #define CTX_CTRL_RS_CTX_ENABLE (1 << 1) -#define RING_CONTEXT_STATUS_BUF_LO(ring, i) _MMIO((ring)->mmio_base + 0x370 + (i) * 8) -#define RING_CONTEXT_STATUS_BUF_HI(ring, i) _MMIO((ring)->mmio_base + 0x370 + (i) * 8 + 4) -#define RING_CONTEXT_STATUS_PTR(ring) _MMIO((ring)->mmio_base + 0x3a0) + +#define RING_CSB_BASE(ring) ((ring)->mmio_base + 0x370) +#define RING_CSB_LO(csb_base, i) _MMIO((csb_base) + (i) * 8) +#define RING_CSB_HI(csb_base, i) _MMIO((csb_base) + (i) * 8 + 4) + +#define RING_CONTEXT_STATUS_BUF_LO(ring, i) RING_CSB_LO(RING_CSB_BASE(ring), i) +#define RING_CONTEXT_STATUS_BUF_HI(ring, i) RING_CSB_HI(RING_CSB_BASE(ring), i) +#define RING_CONTEXT_STATUS_PTR(ring) _MMIO((ring)->mmio_base + 0x3a0) /* The docs specify that the write pointer wraps around after 5h, "After status * is written out to the last available status QW at offset 5h, this pointer