diff mbox

[v2] drm/i915/bxt: Broxton decoupled MMIO

Message ID 1474305302-14503-1-git-send-email-praveen.paneri@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Praveen Paneri Sept. 19, 2016, 5:15 p.m. UTC
Decoupled MMIO is an alternative way to access forcewake domain
registers, which requires less cycles and avoids frequent software
forcewake.

v2:
- Moved platform check out of the function and got rid of duplicate
functions to find out decoupled power domain.
- Added a check for forcewake already held and skipped decoupled access
- Skipped writing 64 bit register through decoupled MMIO

Signed-off-by: Zhe Wang <zhe1.wang@intel.com>
Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
Signed-off-by: Ankitprasad Sharma <ankitprasad.r.sharma@intel.com>
Signed-off-by: Praveen Paneri <praveen.paneri@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
 drivers/gpu/drm/i915/i915_reg.h     |   7 +++
 drivers/gpu/drm/i915/intel_uncore.c | 118 +++++++++++++++++++++++++++++++++++-
 3 files changed, 134 insertions(+), 2 deletions(-)

Comments

Tvrtko Ursulin Sept. 23, 2016, 9:49 a.m. UTC | #1
Hi,

On 19/09/2016 18:15, Praveen Paneri wrote:

> Decoupled MMIO is an alternative way to access forcewake domain
> registers, which requires less cycles and avoids frequent software
> forcewake.

I would like to see a sentence or two on how it works here.

> v2:
> - Moved platform check out of the function and got rid of duplicate
> functions to find out decoupled power domain.
> - Added a check for forcewake already held and skipped decoupled access
> - Skipped writing 64 bit register through decoupled MMIO
>
> Signed-off-by: Zhe Wang <zhe1.wang@intel.com>
> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
> Signed-off-by: Ankitprasad Sharma <ankitprasad.r.sharma@intel.com>
> Signed-off-by: Praveen Paneri <praveen.paneri@intel.com>

How come this list of SoBs?

> ---
>   drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
>   drivers/gpu/drm/i915/i915_reg.h     |   7 +++
>   drivers/gpu/drm/i915/intel_uncore.c | 118 +++++++++++++++++++++++++++++++++++-
>   3 files changed, 134 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4dd307e..065247b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -558,6 +558,16 @@ enum forcewake_domains {
>   #define FW_REG_READ  (1)
>   #define FW_REG_WRITE (2)
>   
> +enum power_domains {

If this is specific to decouples mmio maybe call it decoupled_power_domains?

> +	GEN9_DECOUPLED_PD_BLITTER = 0,
> +	GEN9_DECOUPLED_PD_RENDER,
> +	GEN9_DECOUPLED_PD_MEDIA,
> +	GEN9_DECOUPLED_PD_ALL
> +};
> +
> +#define GEN9_DECOUPLED_OP_WRITE (0)
> +#define GEN9_DECOUPLED_OP_READ (1)

Would it be more consistent to make these ones enums as well?

> +
>   enum forcewake_domains
>   intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>   			       i915_reg_t reg, unsigned int op);
> @@ -2854,6 +2864,7 @@ struct drm_i915_cmd_table {
>   #define GT_FREQUENCY_MULTIPLIER 50
>   #define GEN9_FREQ_SCALER 3
>   
> +#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv) && IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))

There is a recent patch series from Carlos Santa which moved all these 
type of things to device info. So I think you have to make this 
compliant with that new style.

>   #include "i915_trace.h"
>   
>   static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 70d9616..be59803 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -7394,6 +7394,13 @@ enum {
>   #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)
>   #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)
>   
> +/* Decoupled MMIO register pair for kernel driver */
> +#define GEN9_DECOUPLED_REG0_DW0			_MMIO(0xF00)
> +#define GEN9_DECOUPLED_REG0_DW1			_MMIO(0xF04)
> +#define GEN9_DECOUPLED_DW1_GO			(1<<31)
> +#define GEN9_DECOUPLED_PD_SHIFT			28
> +#define GEN9_DECOUPLED_OP_SHIFT			24
> +
>   /* Per-pipe DDI Function Control */
>   #define _TRANS_DDI_FUNC_CTL_A		0x60400
>   #define _TRANS_DDI_FUNC_CTL_B		0x61400
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index a9b6c93..06fffba 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -816,6 +816,42 @@ unclaimed_reg_debug(struct drm_i915_private *dev_priv,
>   	__unclaimed_reg_debug(dev_priv, reg, read, before);
>   }
>   
> +/*
> + * Decoupled MMIO access for only 1 DWORD
> + */
> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
> +					 uint32_t reg, u32 *ptr_data,

Seeing uint32_t and u32 on the same line looks a bit untidy to me. I 
think you should only use one and that in kernel u32 is preferred.

> +					 enum forcewake_domains fw_engine, int operation)
> +{
> +	enum power_domains pd_engine;
> +	u32 ctrl_reg_data = 0;
> +
> +	if (operation == GEN9_DECOUPLED_OP_WRITE)
> +		__raw_i915_write32(dev_priv,
> +				GEN9_DECOUPLED_REG0_DW0,
> +				*ptr_data);
> +
> +	pd_engine = (fw_engine == (FORCEWAKE_RENDER || FORCEWAKE_BLITTER)) ? \
> +		     !(fw_engine >> 1) : (fw_engine >> 1); \

Feels that look up table would be better.

> +
> +	ctrl_reg_data |= reg;
> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
> +	ctrl_reg_data |= (pd_engine << GEN9_DECOUPLED_PD_SHIFT);
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,
> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
> +			FORCEWAKE_ACK_TIMEOUT_MS))
> +		DRM_ERROR("Decoupled MMIO wait timed out\n");
> +

Is FORCEWAKE_ACK_TIMEOUT_MS correct/relevant for decoupled MMIO? It is 
potentially quite a long atomic wait.

Would it be worth returning some fixed value in the case of a timeout? 
Might be better than random stuff? (applies in the 64-bit read case)

> +	if (operation == GEN9_DECOUPLED_OP_READ)
> +		*ptr_data = __raw_i915_read32(dev_priv,
> +				GEN9_DECOUPLED_REG0_DW0);
> +}
> +
>   #define GEN2_READ_HEADER(x) \
>   	u##x val = 0; \
>   	assert_rpm_wakelock_held(dev_priv);
> @@ -892,6 +928,20 @@ static inline void __force_wake_auto(struct drm_i915_private *dev_priv,
>   		dev_priv->uncore.funcs.force_wake_get(dev_priv, fw_domains);
>   }
>   
> +static inline bool __is_forcewake_active(struct drm_i915_private *dev_priv,
> +					 enum forcewake_domains fw_domains)
> +{
> +	struct intel_uncore_forcewake_domain *domain;
> +
> +	/* Ideally GCC would be constant-fold and eliminate this loop */
> +	for_each_fw_domain_masked(domain, fw_domains, dev_priv) {
> +		if (domain->wake_count)
> +			fw_domains &= ~domain->mask;
> +	}
> +
> +	return fw_domains ? 0 : 1;
> +}
> +
>   #define __gen6_read(x) \
>   static u##x \
>   gen6_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
> @@ -940,6 +990,37 @@ gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
>   	GEN6_READ_FOOTER; \
>   }
>   
> +#define __gen9_decoupled_read(x) \
> +static u##x \
> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
> +	enum forcewake_domains fw_engine; \

fw_engines

> +	GEN6_READ_HEADER(x); \
> +	fw_engine = __gen9_reg_read_fw_domains(offset); \
> +	if (fw_engine && x%32 == 0) { \
> +		if (__is_forcewake_active(dev_priv, fw_engine)) \
> +			__raw_i915_write##x(dev_priv, reg, val); \

Write in the read macro, I don't understand!?

Also, would a single mmio read call be possible, something like below?

if (x % 32 || !fw_engines || __is_forcewake_active()) {
     if (fw_engines)
         __force_wake_auto
      __raw_i915_read
} else {
     ... decoupled mmio loop ...
}

I might have made an oversight, no guarantees that I haven't. :)

> +		else { \
> +			unsigned i; \
> +			u32 *ptr_data = (u32 *) &val; \

Hm, curios if taking the address of val causes some issues for code 
generation. I am not set up yet to try it out myself, but would be 
interesting to see the difference between your approach and the one 
where reads are done via return value and not pointer writes. Even if it 
means refactoring the common accessor a bit.

> +			for (i = 0; i < x/32; i++) \

Could you check if for the 64-bit read compiler manages to unroll this loop?

> +				__gen9_decoupled_mmio_access(dev_priv, \
> +						(offset + i*4), \

Hopefully compiler manages to optimize, but please check and in case it 
doesn't consider offsett += sizeof(u32) in the for statement as an 
alternative.

> +						ptr_data + i, \
> +						fw_engine, \
> +						GEN9_DECOUPLED_OP_READ); \
> +		} \
> +	} else { \
> +		if (fw_engine) \
> +			__force_wake_auto(dev_priv, fw_engine); \
> +		val = __raw_i915_read##x(dev_priv, reg); \
> +	} \
> +	GEN6_READ_FOOTER; \
> +}
> +
> +__gen9_decoupled_read(8)
> +__gen9_decoupled_read(16)

What is the value of defining 8 and 16-bit version of those when they 
won't use the decoupled mmio path at all? Wouldn't it be better to just 
assign a mix of gen9 mmio functions in that case?

> +__gen9_decoupled_read(32)
> +__gen9_decoupled_read(64)
>   __gen9_read(8)
>   __gen9_read(16)
>   __gen9_read(32)
> @@ -1107,6 +1188,34 @@ gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
>   	GEN6_WRITE_FOOTER; \
>   }
>   
> +#define __gen9_decoupled_write(x) \
> +static void \
> +gen9_decoupled_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
> +		bool trace) { \
> +	enum forcewake_domains fw_engine; \
> +	GEN6_WRITE_HEADER; \
> +	fw_engine = __gen9_reg_write_fw_domains(offset); \
> +	if (fw_engine && x == 32) { \
> +		u32 *ptr_data = (u32 *) &val; \
> +		if (__is_forcewake_active(dev_priv, fw_engine)) \
> +			__raw_i915_write##x(dev_priv, reg, val); \
> +		else \
> +			__gen9_decoupled_mmio_access(dev_priv, \
> +				offset, \
> +				ptr_data, \
> +				fw_engine, \
> +				GEN9_DECOUPLED_OP_WRITE); \
> +	} else { \
> +		if (fw_engine) \
> +			__force_wake_auto(dev_priv, fw_engine); \
> +		__raw_i915_write##x(dev_priv, reg, val); \
> +	} \
> +	GEN6_WRITE_FOOTER; \
> +}
> +
> +__gen9_decoupled_write(8)
> +__gen9_decoupled_write(16)

All same comments as for the read block.

> +__gen9_decoupled_write(32)
>   __gen9_write(8)
>   __gen9_write(16)
>   __gen9_write(32)
> @@ -1328,8 +1437,13 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
>   	switch (INTEL_INFO(dev_priv)->gen) {
>   	default:
>   	case 9:
> -		ASSIGN_WRITE_MMIO_VFUNCS(gen9);
> -		ASSIGN_READ_MMIO_VFUNCS(gen9);
> +		if (HAS_DECOUPLED_MMIO(dev_priv)) {
> +			ASSIGN_WRITE_MMIO_VFUNCS(gen9_decoupled);
> +			ASSIGN_READ_MMIO_VFUNCS(gen9_decoupled);
> +		} else {
> +			ASSIGN_WRITE_MMIO_VFUNCS(gen9);
> +			ASSIGN_READ_MMIO_VFUNCS(gen9);
> +		}

So with the suggestion from above this would have to be open coded to 
only override 32 and 64 bit ops with decoupled versions, but I think 
that is better than having duplicated code compiled in the module.

>   		break;
>   	case 8:
>   		if (IS_CHERRYVIEW(dev_priv)) {

Regards,

Tvrtko
Praveen Paneri Sept. 26, 2016, 11:08 a.m. UTC | #2
On 9/23/2016 3:19 PM, Tvrtko Ursulin wrote:
>
> Hi,
>
> On 19/09/2016 18:15, Praveen Paneri wrote:
>
>> Decoupled MMIO is an alternative way to access forcewake domain
>> registers, which requires less cycles and avoids frequent software
>> forcewake.
>
> I would like to see a sentence or two on how it works here.
Sure
>
>> v2:
>> - Moved platform check out of the function and got rid of duplicate
>> functions to find out decoupled power domain.
>> - Added a check for forcewake already held and skipped decoupled access
>> - Skipped writing 64 bit register through decoupled MMIO
>>
>> Signed-off-by: Zhe Wang <zhe1.wang@intel.com>
>> Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
>> Signed-off-by: Ankitprasad Sharma <ankitprasad.r.sharma@intel.com>
>> Signed-off-by: Praveen Paneri <praveen.paneri@intel.com>
>
> How come this list of SoBs?
Will clean it up
>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
>>   drivers/gpu/drm/i915/i915_reg.h     |   7 +++
>>   drivers/gpu/drm/i915/intel_uncore.c | 118
>> +++++++++++++++++++++++++++++++++++-
>>   3 files changed, 134 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 4dd307e..065247b 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -558,6 +558,16 @@ enum forcewake_domains {
>>   #define FW_REG_READ  (1)
>>   #define FW_REG_WRITE (2)
>>   +enum power_domains {
>
> If this is specific to decouples mmio maybe call it
> decoupled_power_domains?
sure
>
>> +    GEN9_DECOUPLED_PD_BLITTER = 0,
>> +    GEN9_DECOUPLED_PD_RENDER,
>> +    GEN9_DECOUPLED_PD_MEDIA,
>> +    GEN9_DECOUPLED_PD_ALL
>> +};
>> +
>> +#define GEN9_DECOUPLED_OP_WRITE (0)
>> +#define GEN9_DECOUPLED_OP_READ (1)
>
> Would it be more consistent to make these ones enums as well?
yes, will do that
>
>> +
>>   enum forcewake_domains
>>   intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>>                      i915_reg_t reg, unsigned int op);
>> @@ -2854,6 +2864,7 @@ struct drm_i915_cmd_table {
>>   #define GT_FREQUENCY_MULTIPLIER 50
>>   #define GEN9_FREQ_SCALER 3
>>   +#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv) &&
>> IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
>
> There is a recent patch series from Carlos Santa which moved all these
> type of things to device info. So I think you have to make this
> compliant with that new style.
I looked into it. Could you suggest where can I add the check for BXT C0 
revision?
Will this be okay
#define HAS_DECOUPLED_MMIO(dev)   (INTEL_INFO(dev)->has_decoupled_mmio 
&& IS_BXT_REVID(to_i915(dev), BXT_REVID_C0, REVID_FOREVER))
>
>>   #include "i915_trace.h"
>>     static inline bool intel_scanout_needs_vtd_wa(struct
>> drm_i915_private *dev_priv)
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h
>> b/drivers/gpu/drm/i915/i915_reg.h
>> index 70d9616..be59803 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -7394,6 +7394,13 @@ enum {
>>   #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)
>>   #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)
>>   +/* Decoupled MMIO register pair for kernel driver */
>> +#define GEN9_DECOUPLED_REG0_DW0            _MMIO(0xF00)
>> +#define GEN9_DECOUPLED_REG0_DW1            _MMIO(0xF04)
>> +#define GEN9_DECOUPLED_DW1_GO            (1<<31)
>> +#define GEN9_DECOUPLED_PD_SHIFT            28
>> +#define GEN9_DECOUPLED_OP_SHIFT            24
>> +
>>   /* Per-pipe DDI Function Control */
>>   #define _TRANS_DDI_FUNC_CTL_A        0x60400
>>   #define _TRANS_DDI_FUNC_CTL_B        0x61400
>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c
>> b/drivers/gpu/drm/i915/intel_uncore.c
>> index a9b6c93..06fffba 100644
>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>> @@ -816,6 +816,42 @@ unclaimed_reg_debug(struct drm_i915_private
>> *dev_priv,
>>       __unclaimed_reg_debug(dev_priv, reg, read, before);
>>   }
>>   +/*
>> + * Decoupled MMIO access for only 1 DWORD
>> + */
>> +static void __gen9_decoupled_mmio_access(struct drm_i915_private
>> *dev_priv,
>> +                     uint32_t reg, u32 *ptr_data,
>
> Seeing uint32_t and u32 on the same line looks a bit untidy to me. I
> think you should only use one and that in kernel u32 is preferred.
sure
>
>> +                     enum forcewake_domains fw_engine, int operation)
>> +{
>> +    enum power_domains pd_engine;
>> +    u32 ctrl_reg_data = 0;
>> +
>> +    if (operation == GEN9_DECOUPLED_OP_WRITE)
>> +        __raw_i915_write32(dev_priv,
>> +                GEN9_DECOUPLED_REG0_DW0,
>> +                *ptr_data);
>> +
>> +    pd_engine = (fw_engine == (FORCEWAKE_RENDER ||
>> FORCEWAKE_BLITTER)) ? \
>> +             !(fw_engine >> 1) : (fw_engine >> 1); \
>
> Feels that look up table would be better.
>
>> +
>> +    ctrl_reg_data |= reg;
>> +    ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
>> +    ctrl_reg_data |= (pd_engine << GEN9_DECOUPLED_PD_SHIFT);
>> +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
>> ctrl_reg_data);
>> +
>> +    ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
>> +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
>> ctrl_reg_data);
>> +
>> +    if (wait_for_atomic((__raw_i915_read32(dev_priv,
>> +            GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
>> +            FORCEWAKE_ACK_TIMEOUT_MS))
>> +        DRM_ERROR("Decoupled MMIO wait timed out\n");
>> +
>
> Is FORCEWAKE_ACK_TIMEOUT_MS correct/relevant for decoupled MMIO? It is
> potentially quite a long atomic wait.
This is max wait time. We might not wait for that long but I can still 
check on it.
>
> Would it be worth returning some fixed value in the case of a timeout?
> Might be better than random stuff? (applies in the 64-bit read case)
This is same as forcewake implementation. If we change it, what would be 
the appropriate fixed value to be returned?
>
>> +    if (operation == GEN9_DECOUPLED_OP_READ)
>> +        *ptr_data = __raw_i915_read32(dev_priv,
>> +                GEN9_DECOUPLED_REG0_DW0);
>> +}
>> +
>>   #define GEN2_READ_HEADER(x) \
>>       u##x val = 0; \
>>       assert_rpm_wakelock_held(dev_priv);
>> @@ -892,6 +928,20 @@ static inline void __force_wake_auto(struct
>> drm_i915_private *dev_priv,
>>           dev_priv->uncore.funcs.force_wake_get(dev_priv, fw_domains);
>>   }
>>   +static inline bool __is_forcewake_active(struct drm_i915_private
>> *dev_priv,
>> +                     enum forcewake_domains fw_domains)
>> +{
>> +    struct intel_uncore_forcewake_domain *domain;
>> +
>> +    /* Ideally GCC would be constant-fold and eliminate this loop */
>> +    for_each_fw_domain_masked(domain, fw_domains, dev_priv) {
>> +        if (domain->wake_count)
>> +            fw_domains &= ~domain->mask;
>> +    }
>> +
>> +    return fw_domains ? 0 : 1;
>> +}
>> +
>>   #define __gen6_read(x) \
>>   static u##x \
>>   gen6_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool
>> trace) { \
>> @@ -940,6 +990,37 @@ gen9_read##x(struct drm_i915_private *dev_priv,
>> i915_reg_t reg, bool trace) { \
>>       GEN6_READ_FOOTER; \
>>   }
>>   +#define __gen9_decoupled_read(x) \
>> +static u##x \
>> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t
>> reg, bool trace) { \
>> +    enum forcewake_domains fw_engine; \
>
> fw_engines
>
>> +    GEN6_READ_HEADER(x); \
>> +    fw_engine = __gen9_reg_read_fw_domains(offset); \
>> +    if (fw_engine && x%32 == 0) { \
>> +        if (__is_forcewake_active(dev_priv, fw_engine)) \
>> +            __raw_i915_write##x(dev_priv, reg, val); \
>
> Write in the read macro, I don't understand!?
typo, I will fix it.
>
> Also, would a single mmio read call be possible, something like below?
>
> if (x % 32 || !fw_engines || __is_forcewake_active()) {
>     if (fw_engines)
>         __force_wake_auto
>      __raw_i915_read
> } else {
>     ... decoupled mmio loop ...
> }
>
> I might have made an oversight, no guarantees that I haven't. :)
Looks fine. Will test this
>
>> +        else { \
>> +            unsigned i; \
>> +            u32 *ptr_data = (u32 *) &val; \
>
> Hm, curios if taking the address of val causes some issues for code
> generation. I am not set up yet to try it out myself, but would be
> interesting to see the difference between your approach and the one
> where reads are done via return value and not pointer writes. Even if it
> means refactoring the common accessor a bit.
>
>> +            for (i = 0; i < x/32; i++) \
>
> Could you check if for the 64-bit read compiler manages to unroll this
> loop?
>
>> +                __gen9_decoupled_mmio_access(dev_priv, \
>> +                        (offset + i*4), \
>
> Hopefully compiler manages to optimize, but please check and in case it
> doesn't consider offsett += sizeof(u32) in the for statement as an
> alternative.
Will check the assembly and let you know how compiler generates the code.
>
>> +                        ptr_data + i, \
>> +                        fw_engine, \
>> +                        GEN9_DECOUPLED_OP_READ); \
>> +        } \
>> +    } else { \
>> +        if (fw_engine) \
>> +            __force_wake_auto(dev_priv, fw_engine); \
>> +        val = __raw_i915_read##x(dev_priv, reg); \
>> +    } \
>> +    GEN6_READ_FOOTER; \
>> +}
>> +
>> +__gen9_decoupled_read(8)
>> +__gen9_decoupled_read(16)
>
> What is the value of defining 8 and 16-bit version of those when they
> won't use the decoupled mmio path at all? Wouldn't it be better to just
> assign a mix of gen9 mmio functions in that case?
Will fix it as per below suggestion.
>
>> +__gen9_decoupled_read(32)
>> +__gen9_decoupled_read(64)
>>   __gen9_read(8)
>>   __gen9_read(16)
>>   __gen9_read(32)
>> @@ -1107,6 +1188,34 @@ gen9_write##x(struct drm_i915_private
>> *dev_priv, i915_reg_t reg, u##x val, \
>>       GEN6_WRITE_FOOTER; \
>>   }
>>   +#define __gen9_decoupled_write(x) \
>> +static void \
>> +gen9_decoupled_write##x(struct drm_i915_private *dev_priv, i915_reg_t
>> reg, u##x val, \
>> +        bool trace) { \
>> +    enum forcewake_domains fw_engine; \
>> +    GEN6_WRITE_HEADER; \
>> +    fw_engine = __gen9_reg_write_fw_domains(offset); \
>> +    if (fw_engine && x == 32) { \
>> +        u32 *ptr_data = (u32 *) &val; \
>> +        if (__is_forcewake_active(dev_priv, fw_engine)) \
>> +            __raw_i915_write##x(dev_priv, reg, val); \
>> +        else \
>> +            __gen9_decoupled_mmio_access(dev_priv, \
>> +                offset, \
>> +                ptr_data, \
>> +                fw_engine, \
>> +                GEN9_DECOUPLED_OP_WRITE); \
>> +    } else { \
>> +        if (fw_engine) \
>> +            __force_wake_auto(dev_priv, fw_engine); \
>> +        __raw_i915_write##x(dev_priv, reg, val); \
>> +    } \
>> +    GEN6_WRITE_FOOTER; \
>> +}
>> +
>> +__gen9_decoupled_write(8)
>> +__gen9_decoupled_write(16)
>
> All same comments as for the read block.
>
>> +__gen9_decoupled_write(32)
>>   __gen9_write(8)
>>   __gen9_write(16)
>>   __gen9_write(32)
>> @@ -1328,8 +1437,13 @@ void intel_uncore_init(struct drm_i915_private
>> *dev_priv)
>>       switch (INTEL_INFO(dev_priv)->gen) {
>>       default:
>>       case 9:
>> -        ASSIGN_WRITE_MMIO_VFUNCS(gen9);
>> -        ASSIGN_READ_MMIO_VFUNCS(gen9);
>> +        if (HAS_DECOUPLED_MMIO(dev_priv)) {
>> +            ASSIGN_WRITE_MMIO_VFUNCS(gen9_decoupled);
>> +            ASSIGN_READ_MMIO_VFUNCS(gen9_decoupled);
>> +        } else {
>> +            ASSIGN_WRITE_MMIO_VFUNCS(gen9);
>> +            ASSIGN_READ_MMIO_VFUNCS(gen9);
>> +        }
>
> So with the suggestion from above this would have to be open coded to
> only override 32 and 64 bit ops with decoupled versions, but I think
> that is better than having duplicated code compiled in the module.
>
>>           break;
>>       case 8:
>>           if (IS_CHERRYVIEW(dev_priv)) {
>
> Regards,
>
> Tvrtko
>
Thanks,
Praveen
Tvrtko Ursulin Sept. 26, 2016, 8:23 p.m. UTC | #3
On 26/09/2016 12:08, Paneri, Praveen wrote:
>
> On 9/23/2016 3:19 PM, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 19/09/2016 18:15, Praveen Paneri wrote:
>>

[snip]

>>
>>> +
>>>   enum forcewake_domains
>>>   intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>>>                      i915_reg_t reg, unsigned int op);
>>> @@ -2854,6 +2864,7 @@ struct drm_i915_cmd_table {
>>>   #define GT_FREQUENCY_MULTIPLIER 50
>>>   #define GEN9_FREQ_SCALER 3
>>>   +#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv) &&
>>> IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
>>
>> There is a recent patch series from Carlos Santa which moved all these
>> type of things to device info. So I think you have to make this
>> compliant with that new style.
> I looked into it. Could you suggest where can I add the check for BXT 
> C0 revision?
> Will this be okay
> #define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)->has_decoupled_mmio 
> && IS_BXT_REVID(to_i915(dev), BXT_REVID_C0, REVID_FOREVER))

Good point. I suggest a quick chat with Carlos then to see what was the 
plan for situation like this one.

[snip]

>>
>>> +
>>> +    ctrl_reg_data |= reg;
>>> +    ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
>>> +    ctrl_reg_data |= (pd_engine << GEN9_DECOUPLED_PD_SHIFT);
>>> +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
>>> ctrl_reg_data);
>>> +
>>> +    ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
>>> +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
>>> ctrl_reg_data);
>>> +
>>> +    if (wait_for_atomic((__raw_i915_read32(dev_priv,
>>> +            GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
>>> +            FORCEWAKE_ACK_TIMEOUT_MS))
>>> +        DRM_ERROR("Decoupled MMIO wait timed out\n");
>>> +
>>
>> Is FORCEWAKE_ACK_TIMEOUT_MS correct/relevant for decoupled MMIO? It is
>> potentially quite a long atomic wait.
> This is max wait time. We might not wait for that long but I can still 
> check on it.

Cool, I do think we need to know and document this in the commit and/or 
code comments where a brief explanation of decoupled mmio will be.

>>
>> Would it be worth returning some fixed value in the case of a timeout?
>> Might be better than random stuff? (applies in the 64-bit read case)
> This is same as forcewake implementation. If we change it, what would 
> be the appropriate fixed value to be returned?

Another good point. In that case I suppose it doesn't matter so can 
leave it like it was. It can only theoretically affect 64-bit reads, yes?

>>
>>> +    if (operation == GEN9_DECOUPLED_OP_READ)
>>> +        *ptr_data = __raw_i915_read32(dev_priv,
>>> +                GEN9_DECOUPLED_REG0_DW0);
>>> +}
>>> +
>>>   #define GEN2_READ_HEADER(x) \
>>>       u##x val = 0; \
>>>       assert_rpm_wakelock_held(dev_priv);
>>> @@ -892,6 +928,20 @@ static inline void __force_wake_auto(struct
>>> drm_i915_private *dev_priv,
>>>           dev_priv->uncore.funcs.force_wake_get(dev_priv, fw_domains);
>>>   }
>>>   +static inline bool __is_forcewake_active(struct drm_i915_private
>>> *dev_priv,
>>> +                     enum forcewake_domains fw_domains)
>>> +{
>>> +    struct intel_uncore_forcewake_domain *domain;
>>> +
>>> +    /* Ideally GCC would be constant-fold and eliminate this loop */
>>> +    for_each_fw_domain_masked(domain, fw_domains, dev_priv) {
>>> +        if (domain->wake_count)
>>> +            fw_domains &= ~domain->mask;
>>> +    }
>>> +
>>> +    return fw_domains ? 0 : 1;
>>> +}
>>> +
>>>   #define __gen6_read(x) \
>>>   static u##x \
>>>   gen6_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool
>>> trace) { \
>>> @@ -940,6 +990,37 @@ gen9_read##x(struct drm_i915_private *dev_priv,
>>> i915_reg_t reg, bool trace) { \
>>>       GEN6_READ_FOOTER; \
>>>   }
>>>   +#define __gen9_decoupled_read(x) \
>>> +static u##x \
>>> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t
>>> reg, bool trace) { \
>>> +    enum forcewake_domains fw_engine; \
>>
>> fw_engines
>>
>>> +    GEN6_READ_HEADER(x); \
>>> +    fw_engine = __gen9_reg_read_fw_domains(offset); \
>>> +    if (fw_engine && x%32 == 0) { \
>>> +        if (__is_forcewake_active(dev_priv, fw_engine)) \
>>> +            __raw_i915_write##x(dev_priv, reg, val); \
>>
>> Write in the read macro, I don't understand!?
> typo, I will fix it.
>>
>> Also, would a single mmio read call be possible, something like below?
>>
>> if (x % 32 || !fw_engines || __is_forcewake_active()) {
>>     if (fw_engines)
>>         __force_wake_auto
>>      __raw_i915_read
>> } else {
>>     ... decoupled mmio loop ...
>> }
>>
>> I might have made an oversight, no guarantees that I haven't. :)
> Looks fine. Will test this

Cool, the only thing which would still bug me here is the verboseness of 
__is_forcewake_active but I have no smart ideas for that at the moment. 
Perhaps keeping a bitmask of active domains in the core? Could be worth 
it if for nothing for elegance.

Regards,

Tvrtko
Santa, Carlos Oct. 10, 2016, 5:03 p.m. UTC | #4
On Mon, 2016-09-26 at 21:23 +0100, Tvrtko Ursulin wrote:
> On 26/09/2016 12:08, Paneri, Praveen wrote:
> > 
> > 
> > On 9/23/2016 3:19 PM, Tvrtko Ursulin wrote:
> > > 
> > > 
> > > Hi,
> > > 
> > > On 19/09/2016 18:15, Praveen Paneri wrote:
> > > 
> [snip]
> 
> > 
> > > 
> > > 
> > > > 
> > > > +
> > > >   enum forcewake_domains
> > > >   intel_uncore_forcewake_for_reg(struct drm_i915_private
> > > > *dev_priv,
> > > >                      i915_reg_t reg, unsigned int op);
> > > > @@ -2854,6 +2864,7 @@ struct drm_i915_cmd_table {
> > > >   #define GT_FREQUENCY_MULTIPLIER 50
> > > >   #define GEN9_FREQ_SCALER 3
> > > >   +#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv)
> > > > &&
> > > > IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
> > > There is a recent patch series from Carlos Santa which moved all
> > > these
> > > type of things to device info. So I think you have to make this
> > > compliant with that new style.
> > I looked into it. Could you suggest where can I add the check for
> > BXT 
> > C0 revision?
> > Will this be okay
> > #define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)-
> > >has_decoupled_mmio 
> > && IS_BXT_REVID(to_i915(dev), BXT_REVID_C0, REVID_FOREVER))
> Good point. I suggest a quick chat with Carlos then to see what was
> the 
> plan for situation like this one.
> 
> [snip]

This looks good to me, the main point was to do some clean-up for the
legacy features already there but also to move to a single approach
(device info) when adding new features. I don't see any other way to
specifically check for that version of BXT and this MMIO feature, so
that looks good too.

Carlos


> 
> > 
> > > 
> > > 
> > > > 
> > > > +
> > > > +    ctrl_reg_data |= reg;
> > > > +    ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
> > > > +    ctrl_reg_data |= (pd_engine << GEN9_DECOUPLED_PD_SHIFT);
> > > > +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
> > > > ctrl_reg_data);
> > > > +
> > > > +    ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
> > > > +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
> > > > ctrl_reg_data);
> > > > +
> > > > +    if (wait_for_atomic((__raw_i915_read32(dev_priv,
> > > > +            GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO)
> > > > == 0,
> > > > +            FORCEWAKE_ACK_TIMEOUT_MS))
> > > > +        DRM_ERROR("Decoupled MMIO wait timed out\n");
> > > > +
> > > Is FORCEWAKE_ACK_TIMEOUT_MS correct/relevant for decoupled MMIO?
> > > It is
> > > potentially quite a long atomic wait.
> > This is max wait time. We might not wait for that long but I can
> > still 
> > check on it.
> Cool, I do think we need to know and document this in the commit
> and/or 
> code comments where a brief explanation of decoupled mmio will be.
> 
> > 
> > > 
> > > 
> > > Would it be worth returning some fixed value in the case of a
> > > timeout?
> > > Might be better than random stuff? (applies in the 64-bit read
> > > case)
> > This is same as forcewake implementation. If we change it, what
> > would 
> > be the appropriate fixed value to be returned?
> Another good point. In that case I suppose it doesn't matter so can 
> leave it like it was. It can only theoretically affect 64-bit reads,
> yes?
> 
> > 
> > > 
> > > 
> > > > 
> > > > +    if (operation == GEN9_DECOUPLED_OP_READ)
> > > > +        *ptr_data = __raw_i915_read32(dev_priv,
> > > > +                GEN9_DECOUPLED_REG0_DW0);
> > > > +}
> > > > +
> > > >   #define GEN2_READ_HEADER(x) \
> > > >       u##x val = 0; \
> > > >       assert_rpm_wakelock_held(dev_priv);
> > > > @@ -892,6 +928,20 @@ static inline void
> > > > __force_wake_auto(struct
> > > > drm_i915_private *dev_priv,
> > > >           dev_priv->uncore.funcs.force_wake_get(dev_priv,
> > > > fw_domains);
> > > >   }
> > > >   +static inline bool __is_forcewake_active(struct
> > > > drm_i915_private
> > > > *dev_priv,
> > > > +                     enum forcewake_domains fw_domains)
> > > > +{
> > > > +    struct intel_uncore_forcewake_domain *domain;
> > > > +
> > > > +    /* Ideally GCC would be constant-fold and eliminate this
> > > > loop */
> > > > +    for_each_fw_domain_masked(domain, fw_domains, dev_priv) {
> > > > +        if (domain->wake_count)
> > > > +            fw_domains &= ~domain->mask;
> > > > +    }
> > > > +
> > > > +    return fw_domains ? 0 : 1;
> > > > +}
> > > > +
> > > >   #define __gen6_read(x) \
> > > >   static u##x \
> > > >   gen6_read##x(struct drm_i915_private *dev_priv, i915_reg_t
> > > > reg, bool
> > > > trace) { \
> > > > @@ -940,6 +990,37 @@ gen9_read##x(struct drm_i915_private
> > > > *dev_priv,
> > > > i915_reg_t reg, bool trace) { \
> > > >       GEN6_READ_FOOTER; \
> > > >   }
> > > >   +#define __gen9_decoupled_read(x) \
> > > > +static u##x \
> > > > +gen9_decoupled_read##x(struct drm_i915_private *dev_priv,
> > > > i915_reg_t
> > > > reg, bool trace) { \
> > > > +    enum forcewake_domains fw_engine; \
> > > fw_engines
> > > 
> > > > 
> > > > +    GEN6_READ_HEADER(x); \
> > > > +    fw_engine = __gen9_reg_read_fw_domains(offset); \
> > > > +    if (fw_engine && x%32 == 0) { \
> > > > +        if (__is_forcewake_active(dev_priv, fw_engine)) \
> > > > +            __raw_i915_write##x(dev_priv, reg, val); \
> > > Write in the read macro, I don't understand!?
> > typo, I will fix it.
> > > 
> > > 
> > > Also, would a single mmio read call be possible, something like
> > > below?
> > > 
> > > if (x % 32 || !fw_engines || __is_forcewake_active()) {
> > >     if (fw_engines)
> > >         __force_wake_auto
> > >      __raw_i915_read
> > > } else {
> > >     ... decoupled mmio loop ...
> > > }
> > > 
> > > I might have made an oversight, no guarantees that I haven't. :)
> > Looks fine. Will test this
> Cool, the only thing which would still bug me here is the verboseness
> of 
> __is_forcewake_active but I have no smart ideas for that at the
> moment. 
> Perhaps keeping a bitmask of active domains in the core? Could be
> worth 
> it if for nothing for elegance.
> 
> Regards,
> 
> Tvrtko
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4dd307e..065247b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -558,6 +558,16 @@  enum forcewake_domains {
 #define FW_REG_READ  (1)
 #define FW_REG_WRITE (2)
 
+enum power_domains {
+	GEN9_DECOUPLED_PD_BLITTER = 0,
+	GEN9_DECOUPLED_PD_RENDER,
+	GEN9_DECOUPLED_PD_MEDIA,
+	GEN9_DECOUPLED_PD_ALL
+};
+
+#define GEN9_DECOUPLED_OP_WRITE (0)
+#define GEN9_DECOUPLED_OP_READ (1)
+
 enum forcewake_domains
 intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
 			       i915_reg_t reg, unsigned int op);
@@ -2854,6 +2864,7 @@  struct drm_i915_cmd_table {
 #define GT_FREQUENCY_MULTIPLIER 50
 #define GEN9_FREQ_SCALER 3
 
+#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv) && IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
 #include "i915_trace.h"
 
 static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 70d9616..be59803 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7394,6 +7394,13 @@  enum {
 #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)
 #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)
 
+/* Decoupled MMIO register pair for kernel driver */
+#define GEN9_DECOUPLED_REG0_DW0			_MMIO(0xF00)
+#define GEN9_DECOUPLED_REG0_DW1			_MMIO(0xF04)
+#define GEN9_DECOUPLED_DW1_GO			(1<<31)
+#define GEN9_DECOUPLED_PD_SHIFT			28
+#define GEN9_DECOUPLED_OP_SHIFT			24
+
 /* Per-pipe DDI Function Control */
 #define _TRANS_DDI_FUNC_CTL_A		0x60400
 #define _TRANS_DDI_FUNC_CTL_B		0x61400
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index a9b6c93..06fffba 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -816,6 +816,42 @@  unclaimed_reg_debug(struct drm_i915_private *dev_priv,
 	__unclaimed_reg_debug(dev_priv, reg, read, before);
 }
 
+/*
+ * Decoupled MMIO access for only 1 DWORD
+ */
+static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
+					 uint32_t reg, u32 *ptr_data,
+					 enum forcewake_domains fw_engine, int operation)
+{
+	enum power_domains pd_engine;
+	u32 ctrl_reg_data = 0;
+
+	if (operation == GEN9_DECOUPLED_OP_WRITE)
+		__raw_i915_write32(dev_priv,
+				GEN9_DECOUPLED_REG0_DW0,
+				*ptr_data);
+
+	pd_engine = (fw_engine == (FORCEWAKE_RENDER || FORCEWAKE_BLITTER)) ? \
+		     !(fw_engine >> 1) : (fw_engine >> 1); \
+
+	ctrl_reg_data |= reg;
+	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
+	ctrl_reg_data |= (pd_engine << GEN9_DECOUPLED_PD_SHIFT);
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	if (wait_for_atomic((__raw_i915_read32(dev_priv,
+			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
+			FORCEWAKE_ACK_TIMEOUT_MS))
+		DRM_ERROR("Decoupled MMIO wait timed out\n");
+
+	if (operation == GEN9_DECOUPLED_OP_READ)
+		*ptr_data = __raw_i915_read32(dev_priv,
+				GEN9_DECOUPLED_REG0_DW0);
+}
+
 #define GEN2_READ_HEADER(x) \
 	u##x val = 0; \
 	assert_rpm_wakelock_held(dev_priv);
@@ -892,6 +928,20 @@  static inline void __force_wake_auto(struct drm_i915_private *dev_priv,
 		dev_priv->uncore.funcs.force_wake_get(dev_priv, fw_domains);
 }
 
+static inline bool __is_forcewake_active(struct drm_i915_private *dev_priv,
+					 enum forcewake_domains fw_domains)
+{
+	struct intel_uncore_forcewake_domain *domain;
+
+	/* Ideally GCC would be constant-fold and eliminate this loop */
+	for_each_fw_domain_masked(domain, fw_domains, dev_priv) {
+		if (domain->wake_count)
+			fw_domains &= ~domain->mask;
+	}
+
+	return fw_domains ? 0 : 1;
+}
+
 #define __gen6_read(x) \
 static u##x \
 gen6_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
@@ -940,6 +990,37 @@  gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
 	GEN6_READ_FOOTER; \
 }
 
+#define __gen9_decoupled_read(x) \
+static u##x \
+gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
+	enum forcewake_domains fw_engine; \
+	GEN6_READ_HEADER(x); \
+	fw_engine = __gen9_reg_read_fw_domains(offset); \
+	if (fw_engine && x%32 == 0) { \
+		if (__is_forcewake_active(dev_priv, fw_engine)) \
+			__raw_i915_write##x(dev_priv, reg, val); \
+		else { \
+			unsigned i; \
+			u32 *ptr_data = (u32 *) &val; \
+			for (i = 0; i < x/32; i++) \
+				__gen9_decoupled_mmio_access(dev_priv, \
+						(offset + i*4), \
+						ptr_data + i, \
+						fw_engine, \
+						GEN9_DECOUPLED_OP_READ); \
+		} \
+	} else { \
+		if (fw_engine) \
+			__force_wake_auto(dev_priv, fw_engine); \
+		val = __raw_i915_read##x(dev_priv, reg); \
+	} \
+	GEN6_READ_FOOTER; \
+}
+
+__gen9_decoupled_read(8)
+__gen9_decoupled_read(16)
+__gen9_decoupled_read(32)
+__gen9_decoupled_read(64)
 __gen9_read(8)
 __gen9_read(16)
 __gen9_read(32)
@@ -1107,6 +1188,34 @@  gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
 	GEN6_WRITE_FOOTER; \
 }
 
+#define __gen9_decoupled_write(x) \
+static void \
+gen9_decoupled_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
+		bool trace) { \
+	enum forcewake_domains fw_engine; \
+	GEN6_WRITE_HEADER; \
+	fw_engine = __gen9_reg_write_fw_domains(offset); \
+	if (fw_engine && x == 32) { \
+		u32 *ptr_data = (u32 *) &val; \
+		if (__is_forcewake_active(dev_priv, fw_engine)) \
+			__raw_i915_write##x(dev_priv, reg, val); \
+		else \
+			__gen9_decoupled_mmio_access(dev_priv, \
+				offset, \
+				ptr_data, \
+				fw_engine, \
+				GEN9_DECOUPLED_OP_WRITE); \
+	} else { \
+		if (fw_engine) \
+			__force_wake_auto(dev_priv, fw_engine); \
+		__raw_i915_write##x(dev_priv, reg, val); \
+	} \
+	GEN6_WRITE_FOOTER; \
+}
+
+__gen9_decoupled_write(8)
+__gen9_decoupled_write(16)
+__gen9_decoupled_write(32)
 __gen9_write(8)
 __gen9_write(16)
 __gen9_write(32)
@@ -1328,8 +1437,13 @@  void intel_uncore_init(struct drm_i915_private *dev_priv)
 	switch (INTEL_INFO(dev_priv)->gen) {
 	default:
 	case 9:
-		ASSIGN_WRITE_MMIO_VFUNCS(gen9);
-		ASSIGN_READ_MMIO_VFUNCS(gen9);
+		if (HAS_DECOUPLED_MMIO(dev_priv)) {
+			ASSIGN_WRITE_MMIO_VFUNCS(gen9_decoupled);
+			ASSIGN_READ_MMIO_VFUNCS(gen9_decoupled);
+		} else {
+			ASSIGN_WRITE_MMIO_VFUNCS(gen9);
+			ASSIGN_READ_MMIO_VFUNCS(gen9);
+		}
 		break;
 	case 8:
 		if (IS_CHERRYVIEW(dev_priv)) {