diff mbox

drm/i915/bxt: Broxton decoupled MMIO

Message ID 1473139454-30627-1-git-send-email-praveen.paneri@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Praveen Paneri Sept. 6, 2016, 5:24 a.m. UTC
Decoupled MMIO is an alternative way to access forcewake domain
registers, which requires less cycles and avoids frequent software
forcewake.

Signed-off-by: Zhe Wang <zhe1.wang@intel.com>
Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
Signed-off-by: Ankitprasad Sharma <ankitprasad.r.sharma@intel.com>
Signed-off-by: Praveen Paneri <praveen.paneri@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
 drivers/gpu/drm/i915/i915_reg.h     |   7 +++
 drivers/gpu/drm/i915/intel_uncore.c | 113 +++++++++++++++++++++++++++++++++---
 3 files changed, 122 insertions(+), 9 deletions(-)

Comments

Chris Wilson Sept. 6, 2016, 6:36 a.m. UTC | #1
On Tue, Sep 06, 2016 at 10:54:14AM +0530, Praveen Paneri wrote:
> Decoupled MMIO is an alternative way to access forcewake domain
> registers, which requires less cycles and avoids frequent software
> forcewake.

How about when forcewake is already held? You'll note that we still
require irq-spinlocks so the mmio access is still not great. And we
still will have to frequently take forcewake manually, apparently.

Do you have any statistics to say that we do reduce grabbing the fw
wakelock and that the busywait you add instead is negligible? You are
still using a 50ms timeout, so there is some doubt about "less cycles".

> +/*
> + * Decoupled MMIO access for only 1 DWORD
> + */
> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
> +					 uint32_t reg, u32 *ptr_data,
> +					 enum power_domains pd, int operation)
> +{
> +	u32 ctrl_reg_data = 0;
> +
> +	if (operation == GEN9_DECOUPLED_OP_WRITE)
> +		__raw_i915_write32(dev_priv,
> +				GEN9_DECOUPLED_REG0_DW0,
> +				*ptr_data);
> +
> +	ctrl_reg_data |= reg;
> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
> +	ctrl_reg_data |= (pd << GEN9_DECOUPLED_PD_SHIFT);
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,
> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
> +			FORCEWAKE_ACK_TIMEOUT_MS))
> +		DRM_ERROR("Decoupled MMIO wait timed out\n");
> +
> +	if (operation == GEN9_DECOUPLED_OP_READ)
> +		*ptr_data = __raw_i915_read32(dev_priv,
> +				GEN9_DECOUPLED_REG0_DW0);
> +}
> +
>  #define GEN2_READ_HEADER(x) \
>  	u##x val = 0; \
>  	assert_rpm_wakelock_held(dev_priv);
> @@ -932,12 +997,27 @@ chv_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
>  static u##x \
>  gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
>  	enum forcewake_domains fw_engine; \
> +	enum power_domains pd_engine; \
>  	GEN6_READ_HEADER(x); \
> -	fw_engine = __gen9_reg_read_fw_domains(offset); \
> -	if (fw_engine) \
> -		__force_wake_auto(dev_priv, fw_engine); \
> -	val = __raw_i915_read##x(dev_priv, reg); \
> -	GEN6_READ_FOOTER; \
> +	pd_engine = __gen9_reg_read_power_domains(offset); \
> +	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \

Move the platform test out of here (since it is already a per-platform
vfunc) and then skip the duplicated gen9 functions.

> +		u32 *ptr_data = (u32 *) &val; \
> +		unsigned i = 0; \
> +		for (i = 0; i < x/32; i++) { \

And tidy up the reassignments.

> +			__gen9_decoupled_mmio_access(dev_priv, \
> +					(offset + i*4), \
> +					ptr_data + i, \
> +					pd_engine, \
> +					GEN9_DECOUPLED_OP_READ); \
> +			ptr_data++; \
> +		} \
> +	} else { \
> +		fw_engine = __gen9_reg_read_fw_domains(offset); \
> +		if (fw_engine) \
> +			__force_wake_auto(dev_priv, fw_engine); \
> +		val = __raw_i915_read##x(dev_priv, reg); \
> +	} \
> +		GEN6_READ_FOOTER; \

Misleading indentation.

>  }
>  
>  __gen9_read(8)
> @@ -1101,11 +1181,26 @@ static void \
>  gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
>  		bool trace) { \
>  	enum forcewake_domains fw_engine; \
> +	enum power_domains pd_engine; \
>  	GEN6_WRITE_HEADER; \
> -	fw_engine = __gen9_reg_write_fw_domains(offset); \
> -	if (fw_engine) \
> -		__force_wake_auto(dev_priv, fw_engine); \
> -	__raw_i915_write##x(dev_priv, reg, val); \
> +	pd_engine = __gen9_reg_write_power_domains(offset); \
> +	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \
> +		u32 *ptr_data = (u32 *) &val; \
> +		unsigned i = 0; \
> +		for (i = 0; i < x/32; i++) { \
> +			__gen9_decoupled_mmio_access(dev_priv, \
> +					(offset + i*4), \
> +					ptr_data + i, \
> +					pd_engine, \
> +					GEN9_DECOUPLED_OP_WRITE); \
> +			ptr_data++; \
> +		} \

This is scary for a 64bit write. They are assumed to be an atomic
transaction with hw - when they are not we encounter fun races where the
hardware operates on the intermediate state. Hence we avoid them.
-Chris
Praveen Paneri Sept. 19, 2016, 5:05 p.m. UTC | #2
On Tuesday 06 September 2016 12:06 PM, Chris Wilson wrote:
> On Tue, Sep 06, 2016 at 10:54:14AM +0530, Praveen Paneri wrote:
>> Decoupled MMIO is an alternative way to access forcewake domain
>> registers, which requires less cycles and avoids frequent software
>> forcewake.
>
> How about when forcewake is already held? You'll note that we still
I will try to add the same check (for domain->wake_count) in the decoupled
MMIO path as well, and do a direct register access if forcewake is
already held.
> require irq-spinlocks so the mmio access is still not great. And we
> still will have to frequently take forcewake manually, apparently.
>
> Do you have any statistics to say that we do reduce grabing the fw
> wakelock and that the busywait you add instead is negligible. You are
> still using a 50ms timeout, so there is some doubt about "less cycles".
Sorry, I didn't find any such statistics from the Windows folks.
But I can do an exercise myself to measure the actual benefit of Decoupled
MMIO. Could you please suggest a method to do that?

The feature definitely helps HW with synchronization, as the cycles are
internally serialized in GT, and it eliminates the risk of hitting certain
hangs which exist in theory.
>
>> +/*
>> + * Decoupled MMIO access for only 1 DWORD
>> + */
>> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
>> +					 uint32_t reg, u32 *ptr_data,
>> +					 enum power_domains pd, int operation)
>> +{
>> +	u32 ctrl_reg_data = 0;
>> +
>> +	if (operation == GEN9_DECOUPLED_OP_WRITE)
>> +		__raw_i915_write32(dev_priv,
>> +				GEN9_DECOUPLED_REG0_DW0,
>> +				*ptr_data);
>> +
>> +	ctrl_reg_data |= reg;
>> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
>> +	ctrl_reg_data |= (pd << GEN9_DECOUPLED_PD_SHIFT);
>> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
>> +
>> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
>> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
>> +
>> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,
>> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
>> +			FORCEWAKE_ACK_TIMEOUT_MS))
>> +		DRM_ERROR("Decoupled MMIO wait timed out\n");
>> +
>> +	if (operation == GEN9_DECOUPLED_OP_READ)
>> +		*ptr_data = __raw_i915_read32(dev_priv,
>> +				GEN9_DECOUPLED_REG0_DW0);
>> +}
>> +
>>   #define GEN2_READ_HEADER(x) \
>>   	u##x val = 0; \
>>   	assert_rpm_wakelock_held(dev_priv);
>> @@ -932,12 +997,27 @@ chv_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
>>   static u##x \
>>   gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
>>   	enum forcewake_domains fw_engine; \
>> +	enum power_domains pd_engine; \
>>   	GEN6_READ_HEADER(x); \
>> -	fw_engine = __gen9_reg_read_fw_domains(offset); \
>> -	if (fw_engine) \
>> -		__force_wake_auto(dev_priv, fw_engine); \
>> -	val = __raw_i915_read##x(dev_priv, reg); \
>> -	GEN6_READ_FOOTER; \
>> +	pd_engine = __gen9_reg_read_power_domains(offset); \
>> +	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \
>
> Move the platform test out of here (since it is already a per-platform
> vfunc) and then skip the duplicated gen9 functions.
>
>> +		u32 *ptr_data = (u32 *) &val; \
>> +		unsigned i = 0; \
>> +		for (i = 0; i < x/32; i++) { \
>
> And tidy up the reassignments.
>
>> +			__gen9_decoupled_mmio_access(dev_priv, \
>> +					(offset + i*4), \
>> +					ptr_data + i, \
>> +					pd_engine, \
>> +					GEN9_DECOUPLED_OP_READ); \
>> +			ptr_data++; \
>> +		} \
>> +	} else { \
>> +		fw_engine = __gen9_reg_read_fw_domains(offset); \
>> +		if (fw_engine) \
>> +			__force_wake_auto(dev_priv, fw_engine); \
>> +		val = __raw_i915_read##x(dev_priv, reg); \
>> +	} \
>> +		GEN6_READ_FOOTER; \
>
> Misleading indentation.
>
>>   }
>>
>>   __gen9_read(8)
>> @@ -1101,11 +1181,26 @@ static void \
>>   gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
>>   		bool trace) { \
>>   	enum forcewake_domains fw_engine; \
>> +	enum power_domains pd_engine; \
>>   	GEN6_WRITE_HEADER; \
>> -	fw_engine = __gen9_reg_write_fw_domains(offset); \


>> -	if (fw_engine) \
>> -		__force_wake_auto(dev_priv, fw_engine); \
>> -	__raw_i915_write##x(dev_priv, reg, val); \
>> +	pd_engine = __gen9_reg_write_power_domains(offset); \
>> +	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \
>> +		u32 *ptr_data = (u32 *) &val; \
>> +		unsigned i = 0; \
>> +		for (i = 0; i < x/32; i++) { \
>> +			__gen9_decoupled_mmio_access(dev_priv, \
>> +					(offset + i*4), \
>> +					ptr_data + i, \
>> +					pd_engine, \
>> +					GEN9_DECOUPLED_OP_WRITE); \
>> +			ptr_data++; \
>> +		} \
>
> This is scary for a 64bit write. They are assumed to be an atomic
> transaction with hw - when they are not we encounter fun races where the
> hardware operates on the intermediate state. Hence we avoid them.
Decoupled MMIO currently doesn't support a single 64-bit write. We can
continue to use the existing method for 64-bit writes.
Thanks,
Praveen
> -Chisr
>
Tvrtko Ursulin Nov. 16, 2016, 9:38 a.m. UTC | #3
On 15/11/2016 18:15, Patchwork wrote:
> == Series Details ==
>
> Series: drm/i915/bxt: Broxton decoupled MMIO (rev5)
> URL   : https://patchwork.freedesktop.org/series/12028/
> State : success
>
> == Summary ==
>
> Series 12028v5 drm/i915/bxt: Broxton decoupled MMIO
> https://patchwork.freedesktop.org/api/1.0/series/12028/revisions/5/mbox/
>
>
> fi-bdw-5557u     total:244  pass:229  dwarn:0   dfail:0   fail:0   skip:15
> fi-bsw-n3050     total:244  pass:204  dwarn:0   dfail:0   fail:0   skip:40
> fi-bxt-t5700     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28
> fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28
> fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32
> fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20
> fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20
> fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53
> fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22
> fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22
> fi-kbl-7200u     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22
> fi-skl-6260u     total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14
> fi-skl-6700hq    total:244  pass:223  dwarn:0   dfail:0   fail:0   skip:21
> fi-skl-6700k     total:244  pass:222  dwarn:1   dfail:0   fail:0   skip:21
> fi-skl-6770hq    total:244  pass:230  dwarn:0   dfail:0   fail:0   skip:14
> fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32
> fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33
>
> 04145fe15cf8c81c221e62fc9d65d93053f9bd1a drm-intel-nightly: 2016y-11m-15d-14h-47m-07s UTC integration manifest
> 4acf91b drm/i915/bxt: Broxton decoupled MMIO

Merged to dinq - thanks for the patch!

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c413587..1ecda04 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -558,6 +558,16 @@  enum forcewake_domains {
 #define FW_REG_READ  (1)
 #define FW_REG_WRITE (2)
 
+enum power_domains {
+	GEN9_DECOUPLED_PD_BLITTER = 0,
+	GEN9_DECOUPLED_PD_RENDER,
+	GEN9_DECOUPLED_PD_MEDIA,
+	GEN9_DECOUPLED_PD_ALL
+};
+
+#define GEN9_DECOUPLED_OP_WRITE 0
+#define GEN9_DECOUPLED_OP_READ 1
+
 enum forcewake_domains
 intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
 			       i915_reg_t reg, unsigned int op);
@@ -2842,6 +2852,7 @@  struct drm_i915_cmd_table {
 #define GT_FREQUENCY_MULTIPLIER 50
 #define GEN9_FREQ_SCALER 3
 
+#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv) && IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
 #include "i915_trace.h"
 
 static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a29d707..1a7acdf 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7394,6 +7394,13 @@  enum {
 #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)
 #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)
 
+/* Decoupled MMIO register pair for kernel driver */
+#define GEN9_DECOUPLED_REG0_DW0			_MMIO(0xF00)
+#define GEN9_DECOUPLED_REG0_DW1			_MMIO(0xF04)
+#define GEN9_DECOUPLED_DW1_GO			(1<<31)
+#define GEN9_DECOUPLED_PD_SHIFT			28
+#define GEN9_DECOUPLED_OP_SHIFT			24
+
 /* Per-pipe DDI Function Control */
 #define _TRANS_DDI_FUNC_CTL_A		0x60400
 #define _TRANS_DDI_FUNC_CTL_B		0x61400
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index e9f68cd..d19ee2f 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -745,6 +745,22 @@  static bool is_gen8_shadowed(u32 offset)
 	__fwd; \
 })
 
+#define __gen9_reg_read_power_domains(offset) \
+({ \
+	enum power_domains __dpd; \
+	if (!SKL_NEEDS_FORCE_WAKE(offset)) \
+		__dpd = 0; \
+	else if (FORCEWAKE_GEN9_RENDER_RANGE_OFFSET(offset)) \
+		__dpd = GEN9_DECOUPLED_PD_RENDER; \
+	else if (FORCEWAKE_GEN9_MEDIA_RANGE_OFFSET(offset)) \
+		__dpd = GEN9_DECOUPLED_PD_MEDIA; \
+	else if (FORCEWAKE_GEN9_COMMON_RANGE_OFFSET(offset)) \
+		__dpd = GEN9_DECOUPLED_PD_ALL; \
+	else \
+		__dpd = GEN9_DECOUPLED_PD_BLITTER; \
+	__dpd; \
+})
+
 static const i915_reg_t gen9_shadowed_regs[] = {
 	RING_TAIL(RENDER_RING_BASE),
 	RING_TAIL(GEN6_BSD_RING_BASE),
@@ -781,6 +797,23 @@  static bool is_gen9_shadowed(u32 offset)
 	__fwd; \
 })
 
+#define __gen9_reg_write_power_domains(offset) \
+({ \
+	enum power_domains __dpd; \
+	if (!SKL_NEEDS_FORCE_WAKE(offset) || is_gen9_shadowed(offset)) \
+		__dpd = 0; \
+	else if (FORCEWAKE_GEN9_RENDER_RANGE_OFFSET(offset)) \
+		__dpd = GEN9_DECOUPLED_PD_RENDER; \
+	else if (FORCEWAKE_GEN9_MEDIA_RANGE_OFFSET(offset)) \
+		__dpd = GEN9_DECOUPLED_PD_MEDIA; \
+	else if (FORCEWAKE_GEN9_COMMON_RANGE_OFFSET(offset)) \
+		__dpd = GEN9_DECOUPLED_PD_ALL; \
+	else \
+		__dpd = GEN9_DECOUPLED_PD_BLITTER; \
+	__dpd; \
+})
+
+
 static void
 ilk_dummy_write(struct drm_i915_private *dev_priv)
 {
@@ -816,6 +849,38 @@  unclaimed_reg_debug(struct drm_i915_private *dev_priv,
 	__unclaimed_reg_debug(dev_priv, reg, read, before);
 }
 
+/*
+ * Decoupled MMIO access for only 1 DWORD
+ */
+static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
+					 uint32_t reg, u32 *ptr_data,
+					 enum power_domains pd, int operation)
+{
+	u32 ctrl_reg_data = 0;
+
+	if (operation == GEN9_DECOUPLED_OP_WRITE)
+		__raw_i915_write32(dev_priv,
+				GEN9_DECOUPLED_REG0_DW0,
+				*ptr_data);
+
+	ctrl_reg_data |= reg;
+	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
+	ctrl_reg_data |= (pd << GEN9_DECOUPLED_PD_SHIFT);
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	if (wait_for_atomic((__raw_i915_read32(dev_priv,
+			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
+			FORCEWAKE_ACK_TIMEOUT_MS))
+		DRM_ERROR("Decoupled MMIO wait timed out\n");
+
+	if (operation == GEN9_DECOUPLED_OP_READ)
+		*ptr_data = __raw_i915_read32(dev_priv,
+				GEN9_DECOUPLED_REG0_DW0);
+}
+
 #define GEN2_READ_HEADER(x) \
 	u##x val = 0; \
 	assert_rpm_wakelock_held(dev_priv);
@@ -932,12 +997,27 @@  chv_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
 static u##x \
 gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
 	enum forcewake_domains fw_engine; \
+	enum power_domains pd_engine; \
 	GEN6_READ_HEADER(x); \
-	fw_engine = __gen9_reg_read_fw_domains(offset); \
-	if (fw_engine) \
-		__force_wake_auto(dev_priv, fw_engine); \
-	val = __raw_i915_read##x(dev_priv, reg); \
-	GEN6_READ_FOOTER; \
+	pd_engine = __gen9_reg_read_power_domains(offset); \
+	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \
+		u32 *ptr_data = (u32 *) &val; \
+		unsigned i = 0; \
+		for (i = 0; i < x/32; i++) { \
+			__gen9_decoupled_mmio_access(dev_priv, \
+					(offset + i*4), \
+					ptr_data + i, \
+					pd_engine, \
+					GEN9_DECOUPLED_OP_READ); \
+			ptr_data++; \
+		} \
+	} else { \
+		fw_engine = __gen9_reg_read_fw_domains(offset); \
+		if (fw_engine) \
+			__force_wake_auto(dev_priv, fw_engine); \
+		val = __raw_i915_read##x(dev_priv, reg); \
+	} \
+		GEN6_READ_FOOTER; \
 }
 
 __gen9_read(8)
@@ -1101,11 +1181,26 @@  static void \
 gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
 		bool trace) { \
 	enum forcewake_domains fw_engine; \
+	enum power_domains pd_engine; \
 	GEN6_WRITE_HEADER; \
-	fw_engine = __gen9_reg_write_fw_domains(offset); \
-	if (fw_engine) \
-		__force_wake_auto(dev_priv, fw_engine); \
-	__raw_i915_write##x(dev_priv, reg, val); \
+	pd_engine = __gen9_reg_write_power_domains(offset); \
+	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \
+		u32 *ptr_data = (u32 *) &val; \
+		unsigned i = 0; \
+		for (i = 0; i < x/32; i++) { \
+			__gen9_decoupled_mmio_access(dev_priv, \
+					(offset + i*4), \
+					ptr_data + i, \
+					pd_engine, \
+					GEN9_DECOUPLED_OP_WRITE); \
+			ptr_data++; \
+		} \
+	} else { \
+		fw_engine = __gen9_reg_write_fw_domains(offset); \
+		if (fw_engine) \
+			__force_wake_auto(dev_priv, fw_engine); \
+		__raw_i915_write##x(dev_priv, reg, val); \
+	} \
 	GEN6_WRITE_FOOTER; \
 }