diff mbox

[3/6] drm/i915: Add uncore mmio api for per-context registers

Message ID 20170222163634.7079-4-robert@sixbynine.org (mailing list archive)
State New, archived
Headers show

Commit Message

Robert Bragg Feb. 22, 2017, 4:36 p.m. UTC
Since the exponent for periodic OA counter sampling is maintained in a
per-context register while we want to treat it as if it were global
state we need to be able to safely issue an mmio write to a per-context
register and affect any currently running context.

We have to take some extra care in this case and this adds a utility
api to encapsulate what's required.

Signed-off-by: Robert Bragg <robert@sixbynine.org>
---
 drivers/gpu/drm/i915/i915_drv.h     |  4 ++
 drivers/gpu/drm/i915/i915_reg.h     |  3 ++
 drivers/gpu/drm/i915/intel_uncore.c | 73 +++++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+)

Comments

Chris Wilson Feb. 23, 2017, 3:35 p.m. UTC | #1
On Wed, Feb 22, 2017 at 04:36:31PM +0000, Robert Bragg wrote:
> Since the exponent for periodic OA counter sampling is maintained in a
> per-context register while we want to treat it as if it were global
> state we need to be able to safely issue an mmio write to a per-context
> register and affect any currently running context.
> 
> We have to take some extra care in this case and this adds a utility
> api to encapsulate what's required.

Been waying up the pros/cons of having this in uncore. It doesn't
attempt to be generic nor replace the existing instance, so atm I think
it might be better as local to i915_perf.
 
> Signed-off-by: Robert Bragg <robert@sixbynine.org>
> ---
>  drivers/gpu/drm/i915/i915_drv.h     |  4 ++
>  drivers/gpu/drm/i915/i915_reg.h     |  3 ++
>  drivers/gpu/drm/i915/intel_uncore.c | 73 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 80 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 105b97bd34d7..c8d03a2e89cc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -718,6 +718,7 @@ intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>  			       i915_reg_t reg, unsigned int op);
>  
>  struct intel_uncore_funcs {
> +	int (*wait_for_rcs_busy)(struct drm_i915_private *dev_priv);
>  	void (*force_wake_get)(struct drm_i915_private *dev_priv,
>  							enum forcewake_domains domains);
>  	void (*force_wake_put)(struct drm_i915_private *dev_priv,
> @@ -751,6 +752,7 @@ struct intel_uncore {
>  
>  	struct intel_uncore_funcs funcs;
>  
> +	atomic_t hold_rcs_busy_count;
>  	unsigned fifo_count;
>  
>  	enum forcewake_domains fw_domains;
> @@ -3139,6 +3141,8 @@ static inline bool intel_vgpu_active(struct drm_i915_private *dev_priv)
>  {
>  	return dev_priv->vgpu.active;
>  }
> +int intel_uncore_begin_ctx_mmio(struct drm_i915_private *dev_priv);
> +void intel_uncore_end_ctx_mmio(struct drm_i915_private *dev_priv);
>  
>  void
>  i915_enable_pipestat(struct drm_i915_private *dev_priv, enum pipe pipe,
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 141a5c1e3895..94d40e82edc1 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -2313,6 +2313,9 @@ enum skl_disp_power_wells {
>  #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
>  #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
>  
> +#define GEN6_RCS_PWR_FSM _MMIO(0x22ac)
> +#define GEN9_RCS_FE_FSM2 _MMIO(0x22a4)
> +
>  /* Fuse readout registers for GT */
>  #define CHV_FUSE_GT			_MMIO(VLV_DISPLAY_BASE + 0x2168)
>  #define   CHV_FGT_DISABLE_SS0		(1 << 10)
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 441c51fd9746..06bfe5f89ac5 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1274,6 +1274,16 @@ static void intel_uncore_fw_domains_init(struct drm_i915_private *dev_priv)
>  	WARN_ON(dev_priv->uncore.fw_domains == 0);
>  }
>  
> +static int gen8_wait_for_rcs_busy(struct drm_i915_private *dev_priv)
> +{
> +	return wait_for((I915_READ(GEN6_RCS_PWR_FSM) & 0x3f) == 0x30, 50);
> +}
> +
> +static int gen9_wait_for_rcs_busy(struct drm_i915_private *dev_priv)
> +{
> +	return wait_for((I915_READ(GEN9_RCS_FE_FSM2) & 0x3f) == 0x30, 50);
> +}
> +
>  #define ASSIGN_FW_DOMAINS_TABLE(d) \
>  { \
>  	dev_priv->uncore.fw_domains_table = \
> @@ -1305,6 +1315,8 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
>  			dev_priv->uncore.funcs.mmio_writel =
>  						gen9_decoupled_write32;
>  		}
> +		dev_priv->uncore.funcs.wait_for_rcs_busy =
> +						gen9_wait_for_rcs_busy;
>  		break;
>  	case 8:
>  		if (IS_CHERRYVIEW(dev_priv)) {
> @@ -1316,6 +1328,8 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
>  			ASSIGN_WRITE_MMIO_VFUNCS(gen8);
>  			ASSIGN_READ_MMIO_VFUNCS(gen6);
>  		}
> +		dev_priv->uncore.funcs.wait_for_rcs_busy =
> +						gen8_wait_for_rcs_busy;
>  		break;
>  	case 7:
>  	case 6:
> @@ -1858,6 +1872,65 @@ intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>  	return fw_domains;
>  }
>  
> +static int hold_rcs_busy(struct drm_i915_private *dev_priv)
> +{
> +	int ret = 0;
> +
> +	if (atomic_inc_and_test(&dev_priv->uncore.hold_rcs_busy_count)) {
> +		I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
> +			   _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
> +
> +		ret = dev_priv->uncore.funcs.wait_for_rcs_busy(dev_priv);
> +	}
> +
> +	return ret;
> +}
> +
> +static void release_rcs_busy(struct drm_i915_private *dev_priv)
> +{
> +	if (!atomic_dec_and_test(&dev_priv->uncore.hold_rcs_busy_count)) {
> +		I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
> +			   _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
> +	}

atomic_inc_and_test is correct, but atomic_dec_and_test is backwards.
Besides if you just extend the irq spinlock slightly, you get:

+int intel_uncore_begin_ctx_mmio(struct drm_i915_private *dev_priv)
+{
+       unsigned long flags;
+       i915_reg_t reg;
+       int ret;
+
+       if (INTEL_GEN(dev_priv) < 8)
+               return 0;
+
+       spin_lock_irqsave(&dev_priv->uncore.lock, flags);
+       __intel_uncore_forcewake_get(dev_priv, FORCEWAKE_RENDER);
+
+       if (!dev_priv->uncore.rcs_hold_count++)
+               I915_WRITE_FW(GEN6_RC_SLEEP_PSMI_CONTROL,
+                             _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+
+       if  (INTEL_GEN(dev_priv) == 8)
+               reg = GEN6_RCS_PWR_FSM;
+       else
+               reg = GEN9_RCS_FE_FSM2;
+
+       ret = wait_for_atomic((__raw_i915_read32(dev_priv, reg) & 0x3f) == 0x30,
+			      50);
+       if (ret)
+               __intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
+       spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
+
+       return ret;
+}
+
+void intel_uncore_end_ctx_mmio(struct drm_i915_private *dev_priv)
+{
+       unsigned long flags;
+
+       if (INTEL_GEN(dev_priv) < 8)
+               return;
+
+       spin_lock_irqsave(&dev_priv->uncore.lock, flags);
+
+       if (!--dev_priv->uncore.rcs_hold_count)
+               I915_WRITE_FW(GEN6_RC_SLEEP_PSMI_CONTROL,
+                             _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+       __intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
+
+       spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
+}

Note as well you need to think about serialising the wakeup, not just
the hold count.
-Chris
Robert Bragg March 23, 2017, 4:43 p.m. UTC | #2
On Thu, Feb 23, 2017 at 3:35 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Wed, Feb 22, 2017 at 04:36:31PM +0000, Robert Bragg wrote:
>> Since the exponent for periodic OA counter sampling is maintained in a
>> per-context register while we want to treat it as if it were global
>> state we need to be able to safely issue an mmio write to a per-context
>> register and affect any currently running context.
>>
>> We have to take some extra care in this case and this adds a utility
>> api to encapsulate what's required.
>
> Been waying up the pros/cons of having this in uncore. It doesn't
> attempt to be generic nor replace the existing instance, so atm I think
> it might be better as local to i915_perf.

Thanks. Can you maybe clarify for me what "the existing instance"
that's not replaced is in this context?

>
>> Signed-off-by: Robert Bragg <robert@sixbynine.org>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h     |  4 ++
>>  drivers/gpu/drm/i915/i915_reg.h     |  3 ++
>>  drivers/gpu/drm/i915/intel_uncore.c | 73 +++++++++++++++++++++++++++++++++++++
>>  3 files changed, 80 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 105b97bd34d7..c8d03a2e89cc 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -718,6 +718,7 @@ intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>>                              i915_reg_t reg, unsigned int op);
>>
>>  struct intel_uncore_funcs {
>> +     int (*wait_for_rcs_busy)(struct drm_i915_private *dev_priv);
>>       void (*force_wake_get)(struct drm_i915_private *dev_priv,
>>                                                       enum forcewake_domains domains);
>>       void (*force_wake_put)(struct drm_i915_private *dev_priv,
>> @@ -751,6 +752,7 @@ struct intel_uncore {
>>
>>       struct intel_uncore_funcs funcs;
>>
>> +     atomic_t hold_rcs_busy_count;
>>       unsigned fifo_count;
>>
>>       enum forcewake_domains fw_domains;
>> @@ -3139,6 +3141,8 @@ static inline bool intel_vgpu_active(struct drm_i915_private *dev_priv)
>>  {
>>       return dev_priv->vgpu.active;
>>  }
>> +int intel_uncore_begin_ctx_mmio(struct drm_i915_private *dev_priv);
>> +void intel_uncore_end_ctx_mmio(struct drm_i915_private *dev_priv);
>>
>>  void
>>  i915_enable_pipestat(struct drm_i915_private *dev_priv, enum pipe pipe,
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index 141a5c1e3895..94d40e82edc1 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -2313,6 +2313,9 @@ enum skl_disp_power_wells {
>>  #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE      (1 << 12)
>>  #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE     (1<<10)
>>
>> +#define GEN6_RCS_PWR_FSM _MMIO(0x22ac)
>> +#define GEN9_RCS_FE_FSM2 _MMIO(0x22a4)
>> +
>>  /* Fuse readout registers for GT */
>>  #define CHV_FUSE_GT                  _MMIO(VLV_DISPLAY_BASE + 0x2168)
>>  #define   CHV_FGT_DISABLE_SS0                (1 << 10)
>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
>> index 441c51fd9746..06bfe5f89ac5 100644
>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>> @@ -1274,6 +1274,16 @@ static void intel_uncore_fw_domains_init(struct drm_i915_private *dev_priv)
>>       WARN_ON(dev_priv->uncore.fw_domains == 0);
>>  }
>>
>> +static int gen8_wait_for_rcs_busy(struct drm_i915_private *dev_priv)
>> +{
>> +     return wait_for((I915_READ(GEN6_RCS_PWR_FSM) & 0x3f) == 0x30, 50);
>> +}
>> +
>> +static int gen9_wait_for_rcs_busy(struct drm_i915_private *dev_priv)
>> +{
>> +     return wait_for((I915_READ(GEN9_RCS_FE_FSM2) & 0x3f) == 0x30, 50);
>> +}
>> +
>>  #define ASSIGN_FW_DOMAINS_TABLE(d) \
>>  { \
>>       dev_priv->uncore.fw_domains_table = \
>> @@ -1305,6 +1315,8 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
>>                       dev_priv->uncore.funcs.mmio_writel =
>>                                               gen9_decoupled_write32;
>>               }
>> +             dev_priv->uncore.funcs.wait_for_rcs_busy =
>> +                                             gen9_wait_for_rcs_busy;
>>               break;
>>       case 8:
>>               if (IS_CHERRYVIEW(dev_priv)) {
>> @@ -1316,6 +1328,8 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
>>                       ASSIGN_WRITE_MMIO_VFUNCS(gen8);
>>                       ASSIGN_READ_MMIO_VFUNCS(gen6);
>>               }
>> +             dev_priv->uncore.funcs.wait_for_rcs_busy =
>> +                                             gen8_wait_for_rcs_busy;
>>               break;
>>       case 7:
>>       case 6:
>> @@ -1858,6 +1872,65 @@ intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>>       return fw_domains;
>>  }
>>
>> +static int hold_rcs_busy(struct drm_i915_private *dev_priv)
>> +{
>> +     int ret = 0;
>> +
>> +     if (atomic_inc_and_test(&dev_priv->uncore.hold_rcs_busy_count)) {
>> +             I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>> +                        _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
>> +
>> +             ret = dev_priv->uncore.funcs.wait_for_rcs_busy(dev_priv);
>> +     }
>> +
>> +     return ret;
>> +}
>> +
>> +static void release_rcs_busy(struct drm_i915_private *dev_priv)
>> +{
>> +     if (!atomic_dec_and_test(&dev_priv->uncore.hold_rcs_busy_count)) {
>> +             I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>> +                        _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
>> +     }
>
> atomic_inc_and_test is correct, but atomic_dec_and_test is backwards.

Hmm, it's not the mirror and the first is wrong too, sorry I clearly
didn't verify this - though I vaguely recall explicitly testing the
behaviour of the wait_for_rcs_busy at some point in the past :-/

atomic_inc_and_test() is equivalent to (atomic_inc_return(v) == 0) and
atomic_inc_return() evaluates to the post-increment value. The
intention here is to only explicitly wait when the rcs_busy_count
changes from zero to one, but the first atomic_inc_and_test will fail
for that transition and then pass for each subsequent nested
hold_rcs_busy(). The second atomic_dec_and_test() check would have
been ok without that misplaced negation :-/

Adding some printk debugging to this implementation as is and then
nesting hold/release_rcs_busy() calls like:

hold_rcs_busy();
hold_rcs_busy();
release_rcs_busy();
release_rcs_busy();

I got this incorrect output:

[  205.747836] Skipping redundant wake up for per-ctx mmio
[  205.747840] Skipping redundant wake up for per-ctx mmio
[  205.747844] Explicitly relinquishing wake up hold for per-ctx mmio
[  205.747845] Not relinquishing hold for per-ctx mmio yet


Looking at this again, I think I now prefer avoiding these _and_test
macros() and instead having something like:

+if (atomic_dec_return(&dev_priv->uncore.hold_rcs_busy_count) == 0) {

I believe this fixes the ref counting control flow here:

+static int hold_rcs_busy(struct drm_i915_private *dev_priv)
+{
+    int ret = 0;
+
+    /* Only explicitly enact a hold if it's not already being held */
+    if (atomic_inc_return(&dev_priv->uncore.hold_rcs_busy_count) == 1) {
+        I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
+               _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+
+        /* If waiting didn't succeed, decrement counter to allow retries */
+        ret = dev_priv->uncore.funcs.wait_for_rcs_busy(dev_priv);
+        if (ret)
+            atomic_dec(&dev_priv->uncore.hold_rcs_busy_count);
+    }
+
+    return ret;
+}
+
+static void release_rcs_busy(struct drm_i915_private *dev_priv)
+{
+    /* Only drop the hold if nothing else needs it */
+    if (atomic_dec_return(&dev_priv->uncore.hold_rcs_busy_count) == 0) {
+        I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
+               _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+    }
+}

With the same kind of prink debugging with nested calls, I double
checked I got the expected output of:

[   68.375183] Explicitly waiting for wake up for per-ctx mmio
[   68.375190] Skipping redundant wake up for per-ctx mmio
[   68.375196] Not relinquishing hold for per-ctx mmio yet
[   68.375197] Explicitly relinquishing wake up hold for per-ctx mmio


> Besides if you just extend the irq spinlock slightly, you get:
>
> +int intel_uncore_begin_ctx_mmio(struct drm_i915_private *dev_priv)
> +{
> +       unsigned long flags;
> +       i915_reg_t reg;
> +       int ret;
> +
> +       if (INTEL_GEN(dev_priv) < 8)
> +               return 0;
> +
> +       spin_lock_irqsave(&dev_priv->uncore.lock, flags);
> +       __intel_uncore_forcewake_get(dev_priv, FORCEWAKE_RENDER);
> +
> +       if (!dev_priv->uncore.rcs_hold_count++)
> +               I915_WRITE_FW(GEN6_RC_SLEEP_PSMI_CONTROL,
> +                             _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
> +
> +       if  (INTEL_GEN(dev_priv) == 8)
> +               reg = GEN6_RCS_PWR_FSM;
> +       else
> +               reg = GEN9_RCS_FE_FSM2;
> +
> +       ret = wait_for_atomic((__raw_i915_read32(dev_priv, reg) & 0x3f) == 0x30,
> +                             50);
> +       if (ret)
> +               __intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
> +       spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
> +
> +       return ret;
> +}

Thanks for writing this out.

With regards to your earlier comment about not being general/replacing
the existing instance, would the same still apply to something along
these lines - or you'd still suggest something along these lines would
live in i915_perf.c?

I have a couple of concerns about this atm:

I don't really know what kind of worst case latencies to expect while
waiting for the _FSM status, which makes me wonder about the
repercussions of taking the uncore.lock and updating the
implementation to be able to run in atomic context. I'd be a little
worried an OA configuration ioctl might end up blocking something much
more important.

For i915_perf.c we currently only ever need to set these while in
process context.

Another issue with my previous code, carried over here is not dropping
the rcs_hold_count if the wait_for fails.

> +
> +void intel_uncore_end_ctx_mmio(struct drm_i915_private *dev_priv)
> +{
> +       unsigned long flags;
> +
> +       if (INTEL_GEN(dev_priv) < 8)
> +               return;
> +
> +       spin_lock_irqsave(&dev_priv->uncore.lock, flags);
> +
> +       if (!--dev_priv->uncore.rcs_hold_count)
> +               I915_WRITE_FW(GEN6_RC_SLEEP_PSMI_CONTROL,
> +                             _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
> +       __intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
> +
> +       spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
> +}
>
> Note as well you need to think about serialising the wakeup, not just
> the hold count.

Hmm, right, if there were an overlapping _begin_ctx_mmio it could
quite possibly return (before the wake up completes) with no error
just because something else initiated a begin_ctx_mmio.

I think at this point, if I just put something specific to i915 perf
into i915_perf.c then the requirements become so much simpler.

Considering i915 perf is currently the only drm/i915 code needing to
do this, then in this case we also only ever need to do this from
process context from a single non-reentrant ioctl.

Thanks for looking at this,
- Robert

> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 105b97bd34d7..c8d03a2e89cc 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -718,6 +718,7 @@  intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
 			       i915_reg_t reg, unsigned int op);
 
 struct intel_uncore_funcs {
+	int (*wait_for_rcs_busy)(struct drm_i915_private *dev_priv);
 	void (*force_wake_get)(struct drm_i915_private *dev_priv,
 							enum forcewake_domains domains);
 	void (*force_wake_put)(struct drm_i915_private *dev_priv,
@@ -751,6 +752,7 @@  struct intel_uncore {
 
 	struct intel_uncore_funcs funcs;
 
+	atomic_t hold_rcs_busy_count;
 	unsigned fifo_count;
 
 	enum forcewake_domains fw_domains;
@@ -3139,6 +3141,8 @@  static inline bool intel_vgpu_active(struct drm_i915_private *dev_priv)
 {
 	return dev_priv->vgpu.active;
 }
+int intel_uncore_begin_ctx_mmio(struct drm_i915_private *dev_priv);
+void intel_uncore_end_ctx_mmio(struct drm_i915_private *dev_priv);
 
 void
 i915_enable_pipestat(struct drm_i915_private *dev_priv, enum pipe pipe,
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 141a5c1e3895..94d40e82edc1 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2313,6 +2313,9 @@  enum skl_disp_power_wells {
 #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
 #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
 
+#define GEN6_RCS_PWR_FSM _MMIO(0x22ac)
+#define GEN9_RCS_FE_FSM2 _MMIO(0x22a4)
+
 /* Fuse readout registers for GT */
 #define CHV_FUSE_GT			_MMIO(VLV_DISPLAY_BASE + 0x2168)
 #define   CHV_FGT_DISABLE_SS0		(1 << 10)
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 441c51fd9746..06bfe5f89ac5 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1274,6 +1274,16 @@  static void intel_uncore_fw_domains_init(struct drm_i915_private *dev_priv)
 	WARN_ON(dev_priv->uncore.fw_domains == 0);
 }
 
+static int gen8_wait_for_rcs_busy(struct drm_i915_private *dev_priv)
+{
+	return wait_for((I915_READ(GEN6_RCS_PWR_FSM) & 0x3f) == 0x30, 50);
+}
+
+static int gen9_wait_for_rcs_busy(struct drm_i915_private *dev_priv)
+{
+	return wait_for((I915_READ(GEN9_RCS_FE_FSM2) & 0x3f) == 0x30, 50);
+}
+
 #define ASSIGN_FW_DOMAINS_TABLE(d) \
 { \
 	dev_priv->uncore.fw_domains_table = \
@@ -1305,6 +1315,8 @@  void intel_uncore_init(struct drm_i915_private *dev_priv)
 			dev_priv->uncore.funcs.mmio_writel =
 						gen9_decoupled_write32;
 		}
+		dev_priv->uncore.funcs.wait_for_rcs_busy =
+						gen9_wait_for_rcs_busy;
 		break;
 	case 8:
 		if (IS_CHERRYVIEW(dev_priv)) {
@@ -1316,6 +1328,8 @@  void intel_uncore_init(struct drm_i915_private *dev_priv)
 			ASSIGN_WRITE_MMIO_VFUNCS(gen8);
 			ASSIGN_READ_MMIO_VFUNCS(gen6);
 		}
+		dev_priv->uncore.funcs.wait_for_rcs_busy =
+						gen8_wait_for_rcs_busy;
 		break;
 	case 7:
 	case 6:
@@ -1858,6 +1872,65 @@  intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
 	return fw_domains;
 }
 
+static int hold_rcs_busy(struct drm_i915_private *dev_priv)
+{
+	int ret = 0;
+
+	if (atomic_inc_and_test(&dev_priv->uncore.hold_rcs_busy_count)) {
+		I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
+			   _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+
+		ret = dev_priv->uncore.funcs.wait_for_rcs_busy(dev_priv);
+	}
+
+	return ret;
+}
+
+static void release_rcs_busy(struct drm_i915_private *dev_priv)
+{
+	if (!atomic_dec_and_test(&dev_priv->uncore.hold_rcs_busy_count)) {
+		I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
+			   _MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
+	}
+}
+
+/*
+ * From Broadwell PRM, 3D-Media-GPGPU -> Register State Context
+ *
+ * MMIO reads or writes to any of the registers listed in the
+ * “Register State Context image” subsections through HOST/IA
+ * MMIO interface for debug purposes must follow the steps below:
+ *
+ * - SW should set the Force Wakeup bit to prevent GT from entering C6.
+ * - Write 0x2050[31:0] = 0x00010001 (disable sequence).
+ * - Disable IDLE messaging in CS (Write 0x2050[31:0] = 0x00010001).
+ * - BDW:  Poll/Wait for register bits of 0x22AC[6:0] turn to 0x30 value.
+ * - SKL+: Poll/Wait for register bits of 0x22A4[6:0] turn to 0x30 value.
+ * - Read/Write to desired MMIO registers.
+ * - Enable IDLE messaging in CS (Write 0x2050[31:0] = 0x00010000).
+ * - Force Wakeup bit should be reset to enable C6 entry.
+ */
+int intel_uncore_begin_ctx_mmio(struct drm_i915_private *dev_priv)
+{
+	int ret = 0;
+
+	BUG_ON(dev_priv->info.gen < 8);
+
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_RENDER);
+
+	ret = hold_rcs_busy(dev_priv);
+	if (ret)
+	    intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
+
+	return ret;
+}
+
+void intel_uncore_end_ctx_mmio(struct drm_i915_private *dev_priv)
+{
+	release_rcs_busy(dev_priv);
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_RENDER);
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/intel_uncore.c"
 #endif