@@ -131,3 +131,9 @@ depends on DRM_I915
depends on EXPERT
source drivers/gpu/drm/i915/Kconfig.debug
endmenu
+
+menu "drm/i915 Profile Guided Optimisation"
+ visible if EXPERT
+ depends on DRM_I915
+ source drivers/gpu/drm/i915/Kconfig.profile
+endmenu
new file mode 100644
@@ -0,0 +1,26 @@
+config DRM_I915_SPIN_REQUEST_IRQ
+ int
+ default 5 # microseconds
+ help
+ Before sleeping waiting for a request (GPU operation) to complete,
+ we may spend some time polling for its completion. As the IRQ may
+ take a non-negligible time to set up, we do a short spin first to
+ check if the request will complete in the time it would have taken
+ us to enable the interrupt.
+
+ May be 0 to disable the initial spin. In practice, we estimate
+ the cost of enabling the interrupt (if currently disabled) to be
+ a few microseconds.
+
+config DRM_I915_SPIN_REQUEST_CS
+ int
+ default 2 # microseconds
+ help
+ After sleeping for a request (GPU operation) to complete, we will
+ be woken up on the completion of every request prior to the one
+ being waited on. For very short requests, going back to sleep and
+ being woken up again may add considerably to the wakeup latency. To
+ avoid incurring extra latency from the scheduler, we may choose to
+ spin prior to sleeping again.
+
+ May be 0 to disable spinning after being woken.
@@ -1223,8 +1223,32 @@ long i915_wait_request(struct drm_i915_gem_request *req,
GEM_BUG_ON(!intel_wait_has_seqno(&wait));
GEM_BUG_ON(!i915_sw_fence_signaled(&req->submit));
- /* Optimistic short spin before touching IRQs */
- if (__i915_spin_request(req, wait.seqno, state, 5))
+ /*
+ * Optimistic spin before touching IRQs.
+ *
+ * We may use a rather large value here to offset the penalty of
+ * switching away from the active task. Frequently, the client will
+ * wait upon an old swapbuffer to throttle itself to remain within a
+ * frame of the gpu. If the client is running in lockstep with the gpu,
+ * then it should not be waiting long at all, and a sleep now will incur
+ * extra scheduler latency in producing the next frame. To try to
+ * avoid adding the cost of enabling/disabling the interrupt to the
+ * short wait, we first spin to see if the request would have completed
+ * in the time taken to set up the interrupt.
+ *
+ * We need up to 5us to enable the irq, and up to 20us to hide the
+ * scheduler latency of a context switch, ignoring the secondary
+ * impacts from a context switch such as cache eviction.
+ *
+ * The scheme used for low-latency IO is called "hybrid interrupt
+ * polling". The suggestion there is to sleep until just before you
+ * expect to be woken by the device interrupt and then poll for its
+ * completion. That requires having a good predictor for the request
+ * duration, which we currently lack.
+ */
+ if (CONFIG_DRM_I915_SPIN_REQUEST_IRQ &&
+ __i915_spin_request(req, wait.seqno, state,
+ CONFIG_DRM_I915_SPIN_REQUEST_IRQ))
goto complete;
set_current_state(state);
@@ -1280,8 +1304,15 @@ long i915_wait_request(struct drm_i915_gem_request *req,
__i915_wait_request_check_and_reset(req))
continue;
- /* Only spin if we know the GPU is processing this request */
- if (__i915_spin_request(req, wait.seqno, state, 2))
+ /*
+ * A quick spin now we are on the CPU to offset the cost of
+ * context switching away (and so spin for roughly the same as
+ * the scheduler latency). We only spin if we know the GPU is
+ * processing this request, and so likely to finish shortly.
+ */
+ if (CONFIG_DRM_I915_SPIN_REQUEST_CS &&
+ __i915_spin_request(req, wait.seqno, state,
+ CONFIG_DRM_I915_SPIN_REQUEST_CS))
break;
if (!intel_wait_check_request(&wait, req)) {