Message ID | 56BDAE86.3030009@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Feb 12, 2016 at 10:05:58AM +0000, Tvrtko Ursulin wrote: > > On 11/02/16 21:00, Chris Wilson wrote: > > On Thu, Feb 11, 2016 at 06:03:09PM +0000, Tvrtko Ursulin wrote: > >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> > >> Only caller to get_context_status ensures read pointer stays in > >> range so the WARN is impossible. Also, if the WARN would be > >> triggered by a hypothetical new caller stale status would be > >> returned to them. > >> > >> Maybe it is better to wrap the pointer in the function itself > >> then to avoid both and even results in smaller code. > >> > >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> > >> --- > >> drivers/gpu/drm/i915/intel_lrc.c | 15 ++++++--------- > >> 1 file changed, 6 insertions(+), 9 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > >> index 89eb892df4ae..951f1e6af947 100644 > >> --- a/drivers/gpu/drm/i915/intel_lrc.c > >> +++ b/drivers/gpu/drm/i915/intel_lrc.c > >> @@ -507,17 +507,16 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring, > >> return false; > >> } > >> > >> -static void get_context_status(struct intel_engine_cs *ring, > >> - u8 read_pointer, > >> - u32 *status, u32 *context_id) > >> +static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer, > >> + u32 *context_id) > >> { > >> struct drm_i915_private *dev_priv = ring->dev->dev_private; > >> > >> - if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES)) > >> - return; > >> + read_pointer %= GEN8_CSB_ENTRIES; > >> > >> - *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer)); > >> *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); > > > > Micro-optimising hat says not to even do the uncached, spinlocked mmio > > read when not required. > > You mean move the forcewake grab out from elsp write to cover all mmio > reads? 
These two patches make the irq handler around 3.7% smaller and > moving the forcewake/uncore lock shrinks that by a 1% more. Must be > faster as well, if someone could measure it. :) If only we already have benchmarks capable of measuring such optimisations! > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index 1e7ccd0a6573..77a64008f53d 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -375,8 +375,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, > rq0->elsp_submitted++; > > /* You must always write both descriptors in the order below. */ > - spin_lock(&dev_priv->uncore.lock); > - intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); > I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1])); > I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1])); > > @@ -386,8 +384,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, > > /* ELSP is a wo register, use another nearby reg for posting */ > POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring)); > - intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); > - spin_unlock(&dev_priv->uncore.lock); > } > > static int execlists_update_context(struct drm_i915_gem_request *rq) > @@ -513,9 +509,9 @@ static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer, > > read_pointer %= GEN8_CSB_ENTRIES; > > - *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); > + *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); I was also suggesting that we don't need this read in almost 25% of cases! :) > @@ -616,9 +619,16 @@ static int execlists_context_queue(struct drm_i915_gem_request *request) > } > > list_add_tail(&request->execlist_link, &ring->execlist_queue); > - if (num_elements == 0) > + if (num_elements == 0) { > + spin_lock(&dev_priv->uncore.lock); We need the irq variants here. > + intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); -Chris
On 12/02/16 10:21, Chris Wilson wrote: > On Fri, Feb 12, 2016 at 10:05:58AM +0000, Tvrtko Ursulin wrote: >> >> On 11/02/16 21:00, Chris Wilson wrote: >>> On Thu, Feb 11, 2016 at 06:03:09PM +0000, Tvrtko Ursulin wrote: >>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>> >>>> Only caller to get_context_status ensures read pointer stays in >>>> range so the WARN is impossible. Also, if the WARN would be >>>> triggered by a hypothetical new caller stale status would be >>>> returned to them. >>>> >>>> Maybe it is better to wrap the pointer in the function itself >>>> then to avoid both and even results in smaller code. >>>> >>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> >>>> --- >>>> drivers/gpu/drm/i915/intel_lrc.c | 15 ++++++--------- >>>> 1 file changed, 6 insertions(+), 9 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c >>>> index 89eb892df4ae..951f1e6af947 100644 >>>> --- a/drivers/gpu/drm/i915/intel_lrc.c >>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c >>>> @@ -507,17 +507,16 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring, >>>> return false; >>>> } >>>> >>>> -static void get_context_status(struct intel_engine_cs *ring, >>>> - u8 read_pointer, >>>> - u32 *status, u32 *context_id) >>>> +static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer, >>>> + u32 *context_id) >>>> { >>>> struct drm_i915_private *dev_priv = ring->dev->dev_private; >>>> >>>> - if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES)) >>>> - return; >>>> + read_pointer %= GEN8_CSB_ENTRIES; >>>> >>>> - *status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer)); >>>> *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); >>> >>> Micro-optimising hat says not to even do the uncached, spinlocked mmio >>> read when not required. >> >> You mean move the forcewake grab out from elsp write to cover all mmio >> reads? 
These two patches make the irq handler around 3.7% smaller and >> moving the forcewake/uncore lock shrinks that by a 1% more. Must be >> faster as well, if someone could measure it. :) > > If only we already have benchmarks capable of measuring such > optimisations! We do? >> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c >> index 1e7ccd0a6573..77a64008f53d 100644 >> --- a/drivers/gpu/drm/i915/intel_lrc.c >> +++ b/drivers/gpu/drm/i915/intel_lrc.c >> @@ -375,8 +375,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, >> rq0->elsp_submitted++; >> >> /* You must always write both descriptors in the order below. */ >> - spin_lock(&dev_priv->uncore.lock); >> - intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); >> I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1])); >> I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1])); >> >> @@ -386,8 +384,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, >> >> /* ELSP is a wo register, use another nearby reg for posting */ >> POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring)); >> - intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); >> - spin_unlock(&dev_priv->uncore.lock); >> } >> >> static int execlists_update_context(struct drm_i915_gem_request *rq) >> @@ -513,9 +509,9 @@ static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer, >> >> read_pointer %= GEN8_CSB_ENTRIES; >> >> - *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); >> + *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); > > I was also suggesting that we don't need this read in almost 25% of > cases! :) Oh right, yeah I can do that. 
>> @@ -616,9 +619,16 @@ static int execlists_context_queue(struct drm_i915_gem_request *request) >> } >> >> list_add_tail(&request->execlist_link, &ring->execlist_queue); >> - if (num_elements == 0) >> + if (num_elements == 0) { >> + spin_lock(&dev_priv->uncore.lock); > > We need the irq variants here. No, since the execlist lock further up already disables interrupts. Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 1e7ccd0a6573..77a64008f53d 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -375,8 +375,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, rq0->elsp_submitted++; /* You must always write both descriptors in the order below. */ - spin_lock(&dev_priv->uncore.lock); - intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1])); I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1])); @@ -386,8 +384,6 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, /* ELSP is a wo register, use another nearby reg for posting */ POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring)); - intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); - spin_unlock(&dev_priv->uncore.lock); } static int execlists_update_context(struct drm_i915_gem_request *rq) @@ -513,9 +509,9 @@ static u32 get_context_status(struct intel_engine_cs *ring, u8 read_pointer, read_pointer %= GEN8_CSB_ENTRIES; - *context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); + *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer)); - return I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer)); + return I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer)); } /** @@ -535,15 +531,18 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring) u32 status_id; u32 submit_contexts = 0; - status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring)); + spin_lock(&ring->execlist_lock); + + spin_lock(&dev_priv->uncore.lock); + intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); + + status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring)); read_pointer = ring->next_context_status_buffer; write_pointer = GEN8_CSB_WRITE_PTR(status_pointer); if (read_pointer > write_pointer) write_pointer += GEN8_CSB_ENTRIES; - spin_lock(&ring->execlist_lock); - while (read_pointer < 
write_pointer) { status = get_context_status(ring, ++read_pointer, &status_id); @@ -569,22 +568,26 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring) GEN8_CTX_STATUS_ACTIVE_IDLE)))) execlists_context_unqueue(ring); - spin_unlock(&ring->execlist_lock); - - if (unlikely(submit_contexts > 2)) - DRM_ERROR("More than two context complete events?\n"); - ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES; /* Update the read pointer to the old write pointer. Manual ringbuffer * management ftw </sarcasm> */ - I915_WRITE(RING_CONTEXT_STATUS_PTR(ring), - _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, - ring->next_context_status_buffer << 8)); + I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring), + _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, + ring->next_context_status_buffer << 8)); + + intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); + spin_unlock(&dev_priv->uncore.lock); + + spin_unlock(&ring->execlist_lock); + + if (unlikely(submit_contexts > 2)) + DRM_ERROR("More than two context complete events?\n"); } static int execlists_context_queue(struct drm_i915_gem_request *request) { + struct drm_i915_private *dev_priv = request->i915; struct intel_engine_cs *ring = request->ring; struct drm_i915_gem_request *cursor; int num_elements = 0; @@ -616,9 +619,16 @@ static int execlists_context_queue(struct drm_i915_gem_request *request) } list_add_tail(&request->execlist_link, &ring->execlist_queue); - if (num_elements == 0) + if (num_elements == 0) { + spin_lock(&dev_priv->uncore.lock); + intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); + execlists_context_unqueue(ring); + intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); + spin_unlock(&dev_priv->uncore.lock); + } + spin_unlock_irq(&ring->execlist_lock); return 0;