Fi.CI.BAT: warning for drm/i915: Move CSB MMIO reads out of the execlists lock (rev2)

Message ID 20160308101632.GD1405@nuc-i3427.alporthouse.com (mailing list archive)
State New, archived

Commit Message

Chris Wilson March 8, 2016, 10:16 a.m. UTC
On Thu, Mar 03, 2016 at 08:50:46PM +0000, Chris Wilson wrote:
> Yes, patch is sane, I'm just messing around with my Braswell at the
> moment and then I'll try again at getting some numbers. First glance
> suggested a 10% reduction in latency (with a 100% throughput improvement in one
> particular small copy scenario, that I want to reproduce and do some back
> of the envelope calculations to check that it is sane), but the machine
> (thanks to execlists) just dies as soon as I try some more interesting
> benchmarks.

I rebased the patch on top of the execlists thread so that I could
actually use the machine...


On Braswell that improves the nop dispatch latency by 20%:

          gem:exec:latency:0: -0.35%
          gem:exec:latency:1: +4.57%
          gem:exec:latency:2: +0.07%
          gem:exec:latency:4: +18.05%
          gem:exec:latency:8: +26.97%
         gem:exec:latency:16: +20.37%
         gem:exec:latency:32: +19.91%
         gem:exec:latency:64: +24.06%
        gem:exec:latency:128: +23.75%
        gem:exec:latency:256: +24.54%
        gem:exec:latency:512: +24.30%
       gem:exec:latency:1024: +24.43%

Outside of that scenario, the changes are more or less in the noise, even
if we look at the full round-trip latency for synchronous execution:

     gem:exec:nop:rcs:single: -2.68%
 gem:exec:nop:rcs:continuous: -2.28%
     gem:exec:nop:bcs:single: -2.31%
 gem:exec:nop:bcs:continuous: +16.64%
     gem:exec:nop:vcs:single: -6.24%
 gem:exec:nop:vcs:continuous: +3.76%
    gem:exec:nop:vecs:single: +2.56%
gem:exec:nop:vecs:continuous: +1.83%

And with any busywork on top, we lose the effect:

     gem:exec:trace:Atlantis: -0.12%
       gem:exec:trace:glamor: +2.08%
     gem:exec:trace:glxgears: +0.79%
    gem:exec:trace:OglBatch7: +0.45%
          gem:exec:trace:sna: +0.57%
gem:exec:trace:unigine-valley: +0.01%
          gem:exec:trace:uxa: +5.63%

I am hesitant to r-b something that cannot be tested since the relevant
igts simply explode on my machine both before and after the patch.
-Chris

Comments

Tvrtko Ursulin March 8, 2016, 11:08 a.m. UTC | #1
On 08/03/16 10:16, Chris Wilson wrote:
> On Thu, Mar 03, 2016 at 08:50:46PM +0000, Chris Wilson wrote:
>> Yes, patch is sane, I'm just messing around with my Braswell at the
>> moment and then I'll try again at getting some numbers. First glance
>> suggested a 10% reduction in latency (with a 100% throughput improvement in one
>> particular small copy scenario, that I want to reproduce and do some back
>> of the envelope calculations to check that it is sane), but the machine
>> (thanks to execlists) just dies as soon as I try some more interesting
>> benchmarks.
>
> I rebased the patch on top of the execlists thread so that I could
> actually use the machine...
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 4d005dd..5c0a4e0 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -549,13 +549,10 @@ static int intel_execlists_submit(void *arg)
>
>   	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
>   	do {
> -		u32 status;
> -		u32 status_id;
> -		u32 submit_contexts;
> -		u32 status_pointer;
>   		unsigned read_pointer, write_pointer;
> -
> -		spin_lock(&ring->execlist_lock);
> +		u32 csb[GEN8_CSB_ENTRIES][2];
> +		u32 status_pointer;
> +		unsigned i, read, submit_contexts;
>
>   		set_current_state(TASK_INTERRUPTIBLE);
>   		status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring));
> @@ -563,8 +560,6 @@ static int intel_execlists_submit(void *arg)
>   		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
>   		if (read_pointer == write_pointer) {
>   			intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> -			spin_unlock(&ring->execlist_lock);
> -
>   			if (kthread_should_stop())
>   				return 0;
>
> @@ -577,37 +572,41 @@ static int intel_execlists_submit(void *arg)
>   		if (read_pointer > write_pointer)
>   			write_pointer += GEN8_CSB_ENTRIES;
>
> -		submit_contexts = 0;
> +		read = 0;
>   		while (read_pointer < write_pointer) {
> -			status = get_context_status(ring, ++read_pointer, &status_id);
> +			csb[read][0] = get_context_status(ring, ++read_pointer,
> +							  &csb[read][1]);
> +			read++;
> +		}
>
> -			if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED)) {
> -				if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
> -					if (execlists_check_remove_request(ring, status_id))
> +		I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
> +			      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> +					    (write_pointer % GEN8_CSB_ENTRIES) << 8));
> +
> +		spin_lock(&ring->execlist_lock);
> +
> +		submit_contexts = 0;
> +		for (i = 0; i < read; i++) {
> +			if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
> +				if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
> +					if (execlists_check_remove_request(ring, csb[i][1]))
>   						WARN(1, "Lite Restored request removed from queue\n");
>   				} else
>   					WARN(1, "Preemption without Lite Restore\n");
>   			}
>
> -			if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
> -				      GEN8_CTX_STATUS_ELEMENT_SWITCH))
> +			if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
> +					 GEN8_CTX_STATUS_ELEMENT_SWITCH))
>   				submit_contexts +=
> -					execlists_check_remove_request(ring, status_id);
> +					execlists_check_remove_request(ring, csb[i][1]);
>   		}
>
>   		if (submit_contexts) {
>   			if (!ring->disable_lite_restore_wa ||
> -			    (status & GEN8_CTX_STATUS_ACTIVE_IDLE))
> +			    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
>   				execlists_context_unqueue__locked(ring);
>   		}
>
> -
> -		/* Update the read pointer to the old write pointer. Manual ringbuffer
> -		 * management ftw </sarcasm> */
> -		I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
> -			      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> -					    (write_pointer % GEN8_CSB_ENTRIES) << 8));
> -
>   		spin_unlock(&ring->execlist_lock);
>
>   		if (unlikely(submit_contexts > 2))
>
> On Braswell that improves the nop dispatch latency by 20%:
>
>            gem:exec:latency:0: -0.35%
>            gem:exec:latency:1: +4.57%
>            gem:exec:latency:2: +0.07%
>            gem:exec:latency:4: +18.05%
>            gem:exec:latency:8: +26.97%
>           gem:exec:latency:16: +20.37%
>           gem:exec:latency:32: +19.91%
>           gem:exec:latency:64: +24.06%
>          gem:exec:latency:128: +23.75%
>          gem:exec:latency:256: +24.54%
>          gem:exec:latency:512: +24.30%
>         gem:exec:latency:1024: +24.43%

Cool, so it looks like the faster the CPU is relative to the GPU, the
bigger the gain.

> Outside of that scenario, the changes are more or less in the noise, even
> if we look at the full round-trip latency for synchronous execution:
>
>       gem:exec:nop:rcs:single: -2.68%
>   gem:exec:nop:rcs:continuous: -2.28%
>       gem:exec:nop:bcs:single: -2.31%
>   gem:exec:nop:bcs:continuous: +16.64%
>       gem:exec:nop:vcs:single: -6.24%
>   gem:exec:nop:vcs:continuous: +3.76%
>      gem:exec:nop:vecs:single: +2.56%
> gem:exec:nop:vecs:continuous: +1.83%

Yes, I would not expect synchronous execution to benefit a lot. My
thinking is that lock contention is highest (and hence the benefit of
reducing it is largest) when someone keeps submitting batches just below
the threshold of getting blocked by filling the ring buffer, while still
keeping it well filled.
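
To make the pattern concrete, here is a minimal, compilable C sketch of
the snapshot-then-lock idea the patch applies: drain the (slow) status
reads into a local buffer first, and only then take the lock for the
queue bookkeeping. The struct, register slots and helper names below are
hypothetical stand-ins, not the actual i915 code.

#include <pthread.h>
#include <stdint.h>

#define CSB_ENTRIES 6	/* the GEN8 context-status buffer has 6 slots */

/* Hypothetical engine state; the real code reads the status entries
 * over uncached MMIO with I915_READ_FW(). */
struct engine {
	pthread_spinlock_t execlist_lock;
	uint32_t csb_status[CSB_ENTRIES];
	uint32_t csb_ctx_id[CSB_ENTRIES];
	unsigned int read_ptr, write_ptr;	/* hw keeps these <= 6 apart */
};

static void handle_event(struct engine *e, uint32_t status, uint32_t ctx_id)
{
	(void)e; (void)status; (void)ctx_id;	/* queue manipulation elided */
}

static void process_csb(struct engine *e)
{
	uint32_t status[CSB_ENTRIES], ctx_id[CSB_ENTRIES];
	unsigned int i, count = 0;

	/* The expensive reads happen with the lock NOT held. */
	while (e->read_ptr != e->write_ptr) {
		unsigned int slot = e->read_ptr++ % CSB_ENTRIES;
		status[count] = e->csb_status[slot];
		ctx_id[count] = e->csb_ctx_id[slot];
		count++;
	}

	/* The lock now covers only cheap work over the local snapshot,
	 * so submitters contending on it spin for much less time. */
	pthread_spin_lock(&e->execlist_lock);
	for (i = 0; i < count; i++)
		handle_event(e, status[i], ctx_id[i]);
	pthread_spin_unlock(&e->execlist_lock);
}

The design choice is simply to trade a small stack buffer for a much
shorter lock hold time.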

> And with any busywork on top, we lose the effect:
>
>       gem:exec:trace:Atlantis: -0.12%
>         gem:exec:trace:glamor: +2.08%
>       gem:exec:trace:glxgears: +0.79%
>      gem:exec:trace:OglBatch7: +0.45%
>            gem:exec:trace:sna: +0.57%
> gem:exec:trace:unigine-valley: +0.01%
>            gem:exec:trace:uxa: +5.63%

Hey, at least it is not regressing anything and still has potential for 
gains if the factors align.

> I am hesitant to r-b something that cannot be tested since the relevant
> igts simply explode on my machine both before and after the patch.

Well, strictly speaking you don't need to do any platform-matrix testing 
for an r-b. :) But yes, the situation you are seeing on BSW needs to be 
resolved.

Regards,

Tvrtko
Chris Wilson March 8, 2016, 12:50 p.m. UTC | #2
On Tue, Mar 08, 2016 at 11:08:18AM +0000, Tvrtko Ursulin wrote:
> Well strictly speaking you don't need to do any platform matrix
> testing for an r-b. :) But yes, the situation you are seeing on BSW
> needs to be resolved.

As discussed on IRC, it does seem to be unreproducible on !bsw (the
presumption being that we cannot saturate the CPU with the interrupt
handler), but the fix may just be to remove the loop from
cherryview_irq_handler. I'm kicking off a test run to see if that loop
has any merit for GEM...
-Chris
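
For readers without the driver source to hand, the loop being referred
to has roughly this shape. This is a schematic, compilable sketch under
stated assumptions: the register name, the write-one-to-clear semantics
and both handler variants are illustrative stand-ins, not the actual
cherryview_irq_handler code.

#include <stdbool.h>
#include <stdint.h>

/* Fake MMIO space so the sketch compiles; offsets are arbitrary. */
static uint32_t fake_regs[0x10000];

static uint32_t mmio_read(unsigned int reg)
{
	return fake_regs[reg];
}

/* Interrupt identity registers are typically write-one-to-clear. */
static void mmio_write(unsigned int reg, uint32_t v)
{
	fake_regs[reg] &= ~v;
}

#define IIR 0x4a08u	/* stand-in for an interrupt identity register */

static void handle_events(uint32_t iir) { (void)iir; }

/* Looping form: keep re-reading and servicing events until the
 * identity register reads back clear. If the GPU raises events as
 * fast as a slow CPU can drain them, the handler never exits and
 * starves the rest of the system, the suspected failure mode here. */
static bool irq_handler_looping(void)
{
	bool handled = false;
	uint32_t iir;

	while ((iir = mmio_read(IIR)) != 0) {
		mmio_write(IIR, iir);	/* ack what we saw */
		handle_events(iir);
		handled = true;
	}
	return handled;
}

/* Alternative being tested: service one batch of events per hardware
 * interrupt and return, relying on the next interrupt to re-enter. */
static bool irq_handler_once(void)
{
	uint32_t iir = mmio_read(IIR);

	if (iir == 0)
		return false;
	mmio_write(IIR, iir);
	handle_events(iir);
	return true;
}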

Patch

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 4d005dd..5c0a4e0 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -549,13 +549,10 @@ static int intel_execlists_submit(void *arg)
 
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 	do {
-		u32 status;
-		u32 status_id;
-		u32 submit_contexts;
-		u32 status_pointer;
 		unsigned read_pointer, write_pointer;
-
-		spin_lock(&ring->execlist_lock);
+		u32 csb[GEN8_CSB_ENTRIES][2];
+		u32 status_pointer;
+		unsigned i, read, submit_contexts;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring));
@@ -563,8 +560,6 @@ static int intel_execlists_submit(void *arg)
 		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
 		if (read_pointer == write_pointer) {
 			intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-			spin_unlock(&ring->execlist_lock);
-
 			if (kthread_should_stop())
 				return 0;
 
@@ -577,37 +572,41 @@ static int intel_execlists_submit(void *arg)
 		if (read_pointer > write_pointer)
 			write_pointer += GEN8_CSB_ENTRIES;
 
-		submit_contexts = 0;
+		read = 0;
 		while (read_pointer < write_pointer) {
-			status = get_context_status(ring, ++read_pointer, &status_id);
+			csb[read][0] = get_context_status(ring, ++read_pointer,
+							  &csb[read][1]);
+			read++;
+		}
 
-			if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED)) {
-				if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
-					if (execlists_check_remove_request(ring, status_id))
+		I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
+			      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+					    (write_pointer % GEN8_CSB_ENTRIES) << 8));
+
+		spin_lock(&ring->execlist_lock);
+
+		submit_contexts = 0;
+		for (i = 0; i < read; i++) {
+			if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
+				if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
+					if (execlists_check_remove_request(ring, csb[i][1]))
 						WARN(1, "Lite Restored request removed from queue\n");
 				} else
 					WARN(1, "Preemption without Lite Restore\n");
 			}
 
-			if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
-				      GEN8_CTX_STATUS_ELEMENT_SWITCH))
+			if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+					 GEN8_CTX_STATUS_ELEMENT_SWITCH))
 				submit_contexts +=
-					execlists_check_remove_request(ring, status_id);
+					execlists_check_remove_request(ring, csb[i][1]);
 		}
 
 		if (submit_contexts) {
 			if (!ring->disable_lite_restore_wa ||
-			    (status & GEN8_CTX_STATUS_ACTIVE_IDLE))
+			    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
 				execlists_context_unqueue__locked(ring);
 		}
 
-
-		/* Update the read pointer to the old write pointer. Manual ringbuffer
-		 * management ftw </sarcasm> */
-		I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
-			      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-					    (write_pointer % GEN8_CSB_ENTRIES) << 8));
-
 		spin_unlock(&ring->execlist_lock);
 
 		if (unlikely(submit_contexts > 2))