diff mbox

[v3] drm/i915/bxt: Broxton decoupled MMIO

Message ID 1475595966-14754-1-git-send-email-praveen.paneri@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Praveen Paneri Oct. 4, 2016, 3:46 p.m. UTC
Decoupled MMIO is an alternative way to access forcewake domain
registers, which requires fewer cycles for a single read/write and
avoids frequent software forcewake.
This gives an advantage over forcewake, as the new
mechanism “decouples” CPU cycles and allows them to complete even
when the GT is in a CPD (frequency change) or C6 state.

This can co-exist with forcewake, and we will continue to use forcewake
where appropriate, e.g. for 64-bit register writes, to avoid writing
2 dwords separately and landing in funny situations.

v2:
- Moved platform check out of the function and got rid of duplicate
 functions to find out decoupled power domain (Chris)
- Added a check for forcewake already held and skipped decoupled
 access (Chris)
- Skipped writing 64 bit registers through decoupled MMIO (Chris)

v3:
- Improved commit message with more info on decoupled mmio (Tvrtko)
- Changed decoupled operation to enum and used u32 instead of
 uint_32 data type for register offset (Tvrtko)
- Moved HAS_DECOUPLED_MMIO to device info (Tvrtko)
- Added lookup table for converting fw_engine to pd_engine (Tvrtko)
- Improved __gen9_decoupled_read and __gen9_decoupled_write routines (Tvrtko)

Signed-off-by: Zhe Wang <zhe1.wang@intel.com>
Signed-off-by: Praveen Paneri <praveen.paneri@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h     |  18 +++++-
 drivers/gpu/drm/i915/i915_pci.c     |   1 +
 drivers/gpu/drm/i915/i915_reg.h     |   7 +++
 drivers/gpu/drm/i915/intel_uncore.c | 113 ++++++++++++++++++++++++++++++++++++
 4 files changed, 138 insertions(+), 1 deletion(-)

Comments

Rodrigo Vivi Oct. 4, 2016, 5:43 p.m. UTC | #1
Is this still an embargoed feature? Why?

With Apollolake out there we need to work to get permission to upstream
this feature already and post it to intel-gfx@lists.freedesktop.org in
order to get this merged upstream.

A decoupled version for BXT stayed in the internal tree for such a long
time and caused so much trouble on rebases that it ended up being removed.

So, could you please work to get this approved for upstream?

Thanks,
Rodrigo.

On Tue, 2016-10-04 at 21:16 +0530, Praveen Paneri wrote:
> Decoupled MMIO is an alternative way to access forcewake domain

> registers, which requires less cycles for a single read/write and

> avoids frequent software forcewake.

> This certainly gives advantage over the forcewake as this new

> mechanism “decouples” CPU cycles and allow them to complete even

> when GT is in a CPD (frequency change) or C6 state.

> 

> This can co-exist with forcewake and we will continue to use forcewake

> as appropriate. E.g. 64-bit register writes to avoid writing 2 dwords

> separately and land into funny situations.

> 

> v2:

> - Moved platform check out of the function and got rid of duplicate

>  functions to find out decoupled power domain (Chris)

> - Added a check for forcewake already held and skipped decoupled

>  access (Chris)

> - Skipped writing 64 bit registers through decoupled MMIO (Chris)

> 

> v3:

> - Improved commit message with more info on decoupled mmio (Tvrtko)

> - Changed decoupled operation to enum and used u32 instead of

>  uint_32 data type for register offset (Tvrtko)

> - Moved HAS_DECOUPLED_MMIO to device info (Tvrtko)

> - Added lookup table for converting fw_engine to pd_engine (Tvrtko)

> - Improved __gen9_decoupled_read and __gen9_decoupled_write routines (Tvrtko)

> 

> Signed-off-by: Zhe Wang <zhe1.wang@intel.com>

> Signed-off-by: Praveen Paneri <praveen.paneri@intel.com>

> ---

>  drivers/gpu/drm/i915/i915_drv.h     |  18 +++++-

>  drivers/gpu/drm/i915/i915_pci.c     |   1 +

>  drivers/gpu/drm/i915/i915_reg.h     |   7 +++

>  drivers/gpu/drm/i915/intel_uncore.c | 113 ++++++++++++++++++++++++++++++++++++

>  4 files changed, 138 insertions(+), 1 deletion(-)

> 

> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h

> index f8c66ee..bfdd55a 100644

> --- a/drivers/gpu/drm/i915/i915_drv.h

> +++ b/drivers/gpu/drm/i915/i915_drv.h

> @@ -559,6 +559,18 @@ enum forcewake_domains {

>  #define FW_REG_READ  (1)

>  #define FW_REG_WRITE (2)

>  

> +enum decoupled_power_domains {

> +	GEN9_DECOUPLED_PD_BLITTER = 0,

> +	GEN9_DECOUPLED_PD_RENDER,

> +	GEN9_DECOUPLED_PD_MEDIA,

> +	GEN9_DECOUPLED_PD_ALL

> +};

> +

> +enum decoupled_ops {

> +	GEN9_DECOUPLED_OP_WRITE = 0,

> +	GEN9_DECOUPLED_OP_READ

> +};

> +

>  enum forcewake_domains

>  intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,

>  			       i915_reg_t reg, unsigned int op);

> @@ -690,7 +702,8 @@ struct intel_csr {

>  	func(has_snoop) sep \

>  	func(has_ddi) sep \

>  	func(has_fpga_dbg) sep \

> -	func(has_pooled_eu)

> +	func(has_pooled_eu) sep \

> +	func(has_decoupled_mmio)

>  

>  #define DEFINE_FLAG(name) u8 name:1

>  #define SEP_SEMICOLON ;

> @@ -2869,6 +2882,9 @@ struct drm_i915_cmd_table {

>  #define GT_FREQUENCY_MULTIPLIER 50

>  #define GEN9_FREQ_SCALER 3

>  

> +#define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)->has_decoupled_mmio \

> +		&& IS_BXT_REVID(dev, BXT_REVID_C0, REVID_FOREVER))

> +

>  #include "i915_trace.h"

>  

>  static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv)

> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c

> index 31e6edd..5c56c0c 100644

> --- a/drivers/gpu/drm/i915/i915_pci.c

> +++ b/drivers/gpu/drm/i915/i915_pci.c

> @@ -360,6 +360,7 @@ static const struct intel_device_info intel_broxton_info = {

>  	.has_hw_contexts = 1,

>  	.has_logical_ring_contexts = 1,

>  	.has_guc = 1,

> +	.has_decoupled_mmio = 1,

>  	.ddb_size = 512,

>  	GEN_DEFAULT_PIPEOFFSETS,

>  	IVB_CURSOR_OFFSETS,

> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h

> index 8d44cee..bf7b4c9 100644

> --- a/drivers/gpu/drm/i915/i915_reg.h

> +++ b/drivers/gpu/drm/i915/i915_reg.h

> @@ -7398,6 +7398,13 @@ enum {

>  #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)

>  #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)

>  

> +/* Decoupled MMIO register pair for kernel driver */

> +#define GEN9_DECOUPLED_REG0_DW0			_MMIO(0xF00)

> +#define GEN9_DECOUPLED_REG0_DW1			_MMIO(0xF04)

> +#define GEN9_DECOUPLED_DW1_GO			(1<<31)

> +#define GEN9_DECOUPLED_PD_SHIFT			28

> +#define GEN9_DECOUPLED_OP_SHIFT			24

> +

>  /* Per-pipe DDI Function Control */

>  #define _TRANS_DDI_FUNC_CTL_A		0x60400

>  #define _TRANS_DDI_FUNC_CTL_B		0x61400

> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c

> index e2b188d..0af602e 100644

> --- a/drivers/gpu/drm/i915/intel_uncore.c

> +++ b/drivers/gpu/drm/i915/intel_uncore.c

> @@ -831,6 +831,72 @@ unclaimed_reg_debug(struct drm_i915_private *dev_priv,

>  	__unclaimed_reg_debug(dev_priv, reg, read, before);

>  }

>  

> +static const enum decoupled_power_domains fw2dpd_engine[] = {

> +	GEN9_DECOUPLED_PD_RENDER,

> +	GEN9_DECOUPLED_PD_BLITTER,

> +	GEN9_DECOUPLED_PD_ALL,

> +	GEN9_DECOUPLED_PD_MEDIA,

> +	GEN9_DECOUPLED_PD_ALL,

> +	GEN9_DECOUPLED_PD_ALL,

> +	GEN9_DECOUPLED_PD_ALL

> +};

> +

> +/*

> + * Decoupled MMIO access for only 1 DWORD

> + */

> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,

> +					 u32 reg,

> +					 enum forcewake_domains fw_engine,

> +					 enum decoupled_ops operation)

> +{

> +	enum decoupled_power_domains dpd_engine;

> +	u32 ctrl_reg_data = 0;

> +

> +	dpd_engine = fw2dpd_engine[fw_engine - 1];

> +

> +	ctrl_reg_data |= reg;

> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);

> +	ctrl_reg_data |= (dpd_engine << GEN9_DECOUPLED_PD_SHIFT);

> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);

> +

> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;

> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);

> +

> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,

> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,

> +			FORCEWAKE_ACK_TIMEOUT_MS))

> +		DRM_ERROR("Decoupled MMIO wait timed out\n");

> +}

> +

> +static inline u32 __gen9_decoupled_mmio_read(struct drm_i915_private *dev_priv,

> +                                      u32 reg,

> +                                      enum forcewake_domains fw_engine)

> +{

> +	__gen9_decoupled_mmio_access(dev_priv,

> +			reg,

> +			fw_engine,

> +			GEN9_DECOUPLED_OP_READ);

> +

> +	return __raw_i915_read32(dev_priv,

> +			GEN9_DECOUPLED_REG0_DW0);

> +}

> +

> +static inline void __gen9_decoupled_mmio_write(struct drm_i915_private *dev_priv,

> +                                      u32 reg, u32 data,

> +                                      enum forcewake_domains fw_engine)

> +{

> +

> +	__raw_i915_write32(dev_priv,

> +			GEN9_DECOUPLED_REG0_DW0,

> +			data);

> +

> +	__gen9_decoupled_mmio_access(dev_priv,

> +			reg,

> +			fw_engine,

> +			GEN9_DECOUPLED_OP_WRITE);

> +}

> +

> +

>  #define GEN2_READ_HEADER(x) \

>  	u##x val = 0; \

>  	assert_rpm_wakelock_held(dev_priv);

> @@ -935,6 +1001,27 @@ fwtable_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) {

>  	GEN6_READ_FOOTER; \

>  }

>  

> +#define __gen9_decoupled_read(x) \

> +static u##x \

> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \

> +	enum forcewake_domains fw_engine; \

> +	GEN6_READ_HEADER(x); \

> +	fw_engine = __fwtable_reg_read_fw_domains(offset); \

> +	if (!fw_engine || !(fw_engine & ~dev_priv->uncore.fw_domains_active)) { \

> +		val = __raw_i915_read##x(dev_priv, reg); \

> +	} else { \

> +		unsigned i; \

> +		u32 *ptr_data = (u32 *) &val; \

> +		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr_data++) \

> +			*ptr_data = __gen9_decoupled_mmio_read(dev_priv, \

> +						     offset, \

> +						     fw_engine); \

> +	} \

> +	GEN6_READ_FOOTER; \

> +}

> +

> +__gen9_decoupled_read(32)

> +__gen9_decoupled_read(64)

>  __fwtable_read(8)

>  __fwtable_read(16)

>  __fwtable_read(32)

> @@ -1064,6 +1151,24 @@ fwtable_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, bo

>  	GEN6_WRITE_FOOTER; \

>  }

>  

> +#define __gen9_decoupled_write(x) \

> +static void \

> +gen9_decoupled_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \

> +		bool trace) { \

> +	enum forcewake_domains fw_engine; \

> +	GEN6_WRITE_HEADER; \

> +	fw_engine = __fwtable_reg_write_fw_domains(offset); \

> +	if (!fw_engine || !(fw_engine & ~dev_priv->uncore.fw_domains_active)) \

> +		__raw_i915_write##x(dev_priv, reg, val); \

> +	else \

> +		__gen9_decoupled_mmio_write(dev_priv, \

> +					     offset, \

> +					     val, \

> +					     fw_engine); \

> +	GEN6_WRITE_FOOTER; \

> +}

> +

> +__gen9_decoupled_write(32)

>  __fwtable_write(8)

>  __fwtable_write(16)

>  __fwtable_write(32)

> @@ -1287,6 +1392,14 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)

>  		ASSIGN_FW_DOMAINS_TABLE(__gen9_fw_ranges);

>  		ASSIGN_WRITE_MMIO_VFUNCS(fwtable);

>  		ASSIGN_READ_MMIO_VFUNCS(fwtable);

> +		if (HAS_DECOUPLED_MMIO(dev_priv)) {

> +			dev_priv->uncore.funcs.mmio_readl =

> +						gen9_decoupled_read32;

> +			dev_priv->uncore.funcs.mmio_readq =

> +						gen9_decoupled_read64;

> +			dev_priv->uncore.funcs.mmio_writel =

> +						gen9_decoupled_write32;

> +		}

>  		break;

>  	case 8:

>  		if (IS_CHERRYVIEW(dev_priv)) {
Chris Wilson Oct. 4, 2016, 7:56 p.m. UTC | #2
On Tue, Oct 04, 2016 at 09:16:06PM +0530, Praveen Paneri wrote:
> +#define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)->has_decoupled_mmio \
> +		&& IS_BXT_REVID(dev, BXT_REVID_C0, REVID_FOREVER))

Edit dev_priv->info.has_decoupled_mmio on init.

> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
> +					 u32 reg,
> +					 enum forcewake_domains fw_engine,
> +					 enum decoupled_ops operation)
> +{
> +	enum decoupled_power_domains dpd_engine;
> +	u32 ctrl_reg_data = 0;
> +
> +	dpd_engine = fw2dpd_engine[fw_engine - 1];

	enum decoupled_power_domains dpd = fw2dpd_engine[fw_engine - 1];

enum decoupled_power_domain

And don't call it fw_engine. fw_domain, if you must.

> +
> +	ctrl_reg_data |= reg;
> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
> +	ctrl_reg_data |= (dpd_engine << GEN9_DECOUPLED_PD_SHIFT);
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,
> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
> +			FORCEWAKE_ACK_TIMEOUT_MS))
> +		DRM_ERROR("Decoupled MMIO wait timed out\n");
> +}
> +
> +static inline u32 __gen9_decoupled_mmio_read(struct drm_i915_private *dev_priv,
> +                                      u32 reg,
> +                                      enum forcewake_domains fw)

__gen9_decoupled_mmio_read32()

> +{
> +	__gen9_decoupled_mmio_access(dev_priv,
> +			reg,
> +			fw_engine,
> +			GEN9_DECOUPLED_OP_READ);

	__gen9_decoupled_mmio_access(dev_priv, reg, fw, GEN9_DECOUPLED_OP_READ);

> +
> +	return __raw_i915_read32(dev_priv,
> +			GEN9_DECOUPLED_REG0_DW0);

Everywhere! Please be careful with alignment.

> +#define __gen9_decoupled_read(x) \
> +static u##x \
> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
> +	enum forcewake_domains fw_engine; \
> +	GEN6_READ_HEADER(x); \
> +	fw_engine = __fwtable_reg_read_fw_domains(offset); \
> +	if (!fw_engine || !(fw_engine & ~dev_priv->uncore.fw_domains_active)) { \
> +		val = __raw_i915_read##x(dev_priv, reg); \
> +	} else { \
> +		unsigned i; \
> +		u32 *ptr_data = (u32 *) &val; \
> +		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr_data++) \
> +			*ptr_data = __gen9_decoupled_mmio_read(dev_priv, \
> +						     offset, \
> +						     fw_engine); \
> +	} \
> +	GEN6_READ_FOOTER; \
> +}

Reverse it,

	if (domain & ~dev_priv->uncore.fw_domains_active) {
		u32 *ptr = (u32 *)&val;
		unsigned i;

		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr++)
			*ptr = __gen9_decoupled_mmio_read32(dev_priv, offset, domain);
	} else {
		val = __raw_i915_read##x(dev_priv, reg);
	}
Praveen Paneri Oct. 5, 2016, 3:17 a.m. UTC | #3
Thanks Chris for the review. Will fix these and resend.

~Praveen

On Wednesday 05 October 2016 01:26 AM, Chris Wilson wrote:
> On Tue, Oct 04, 2016 at 09:16:06PM +0530, Praveen Paneri wrote:
>> +#define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)->has_decoupled_mmio \
>> +		&& IS_BXT_REVID(dev, BXT_REVID_C0, REVID_FOREVER))
>
> Edit dev_priv->info.has_decoupled_mmio on init.
>
>> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
>> +					 u32 reg,
>> +					 enum forcewake_domains fw_engine,
>> +					 enum decoupled_ops operation)
>> +{
>> +	enum decoupled_power_domains dpd_engine;
>> +	u32 ctrl_reg_data = 0;
>> +
>> +	dpd_engine = fw2dpd_engine[fw_engine - 1];
>
> 	enum decoupled_power_domains dpd = fw2dpd_engine[fw_engine - 1];
>
> enum decoupled_power_domain
>
> And don't call it fw_engine. fw_domain, if you must.
>
>> +
>> +	ctrl_reg_data |= reg;
>> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
>> +	ctrl_reg_data |= (dpd_engine << GEN9_DECOUPLED_PD_SHIFT);
>> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
>> +
>> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
>> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
>> +
>> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,
>> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
>> +			FORCEWAKE_ACK_TIMEOUT_MS))
>> +		DRM_ERROR("Decoupled MMIO wait timed out\n");
>> +}
>> +
>> +static inline u32 __gen9_decoupled_mmio_read(struct drm_i915_private *dev_priv,
>> +                                      u32 reg,
>> +                                      enum forcewake_domains fw)
>
> __gen9_decoupeld_mmio_read32()
>
>> +{
>> +	__gen9_decoupled_mmio_access(dev_priv,
>> +			reg,
>> +			fw_engine,
>> +			GEN9_DECOUPLED_OP_READ);
>
> 	__gen9_decoupled_mmio_access(dev_priv, reg, fw, GEN9_DECOUPLED_OP_READ);
>
>> +
>> +	return __raw_i915_read32(dev_priv,
>> +			GEN9_DECOUPLED_REG0_DW0);
>
> Everywhere! Please be careful with alignment.
>
>> +#define __gen9_decoupled_read(x) \
>> +static u##x \
>> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
>> +	enum forcewake_domains fw_engine; \
>> +	GEN6_READ_HEADER(x); \
>> +	fw_engine = __fwtable_reg_read_fw_domains(offset); \
>> +	if (!fw_engine || !(fw_engine & ~dev_priv->uncore.fw_domains_active)) { \
>> +		val = __raw_i915_read##x(dev_priv, reg); \
>> +	} else { \
>> +		unsigned i; \
>> +		u32 *ptr_data = (u32 *) &val; \
>> +		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr_data++) \
>> +			*ptr_data = __gen9_decoupled_mmio_read(dev_priv, \
>> +						     offset, \
>> +						     fw_engine); \
>> +	} \
>> +	GEN6_READ_FOOTER; \
>> +}
>
> Reverse it,
>
> 	if (domain & ~dev_priv->uncore.fw_domains_active) {
> 		u32 *ptr = (u32 *)&val;
> 		unsigned i;
>
> 		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr++)
> 			*ptr = __gen9_decoupled_mmio_read32(dev_priv, offset, domain);
> 	} else {
> 		val = __raw_i915_read##x(dev_priv, reg);
> 	}
>
Praveen Paneri Oct. 5, 2016, 6:24 a.m. UTC | #4
Hi Chris,

On Wednesday 05 October 2016 01:26 AM, Chris Wilson wrote:
> On Tue, Oct 04, 2016 at 09:16:06PM +0530, Praveen Paneri wrote:
>> +#define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)->has_decoupled_mmio \
>> +		&& IS_BXT_REVID(dev, BXT_REVID_C0, REVID_FOREVER))
>
> Edit dev_priv->info.has_decoupled_mmio on init.
Can I add this check directly into __intel_uncore_early_sanitize(), like 
below?

@@ -419,6 +419,10 @@ static void __intel_uncore_early_sanitize(struct 
drm_i915_private *dev_priv,
                                    GT_FIFO_CTL_RC6_POLICY_STALL);
         }

+       /* Enable Decoupled MMIO only on BXT C stepping onwards */
+       if (!IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
+               info->has_decoupled_mmio = 0;
+
         intel_uncore_forcewake_reset(dev_priv, restore_forcewake);
  }

>
>> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
>> +					 u32 reg,
>> +					 enum forcewake_domains fw_engine,
>> +					 enum decoupled_ops operation)
>> +{
>> +	enum decoupled_power_domains dpd_engine;
>> +	u32 ctrl_reg_data = 0;
>> +
>> +	dpd_engine = fw2dpd_engine[fw_engine - 1];
>
> 	enum decoupled_power_domains dpd = fw2dpd_engine[fw_engine - 1];
>
> enum decoupled_power_domain
>
> And don't call it fw_engine. fw_domain, if you must.
I can change it, but the related existing code still uses fw_engine.
Wouldn't it look out of place?

Thanks,
Praveen
>
>> +
>> +	ctrl_reg_data |= reg;
>> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
>> +	ctrl_reg_data |= (dpd_engine << GEN9_DECOUPLED_PD_SHIFT);
>> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
>> +
>> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
>> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
>> +
>> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,
>> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
>> +			FORCEWAKE_ACK_TIMEOUT_MS))
>> +		DRM_ERROR("Decoupled MMIO wait timed out\n");
>> +}
>> +
>> +static inline u32 __gen9_decoupled_mmio_read(struct drm_i915_private *dev_priv,
>> +                                      u32 reg,
>> +                                      enum forcewake_domains fw)
>
> __gen9_decoupeld_mmio_read32()
>
>> +{
>> +	__gen9_decoupled_mmio_access(dev_priv,
>> +			reg,
>> +			fw_engine,
>> +			GEN9_DECOUPLED_OP_READ);
>
> 	__gen9_decoupled_mmio_access(dev_priv, reg, fw, GEN9_DECOUPLED_OP_READ);
>
>> +
>> +	return __raw_i915_read32(dev_priv,
>> +			GEN9_DECOUPLED_REG0_DW0);
>
> Everywhere! Please be careful with alignment.
>
>> +#define __gen9_decoupled_read(x) \
>> +static u##x \
>> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
>> +	enum forcewake_domains fw_engine; \
>> +	GEN6_READ_HEADER(x); \
>> +	fw_engine = __fwtable_reg_read_fw_domains(offset); \
>> +	if (!fw_engine || !(fw_engine & ~dev_priv->uncore.fw_domains_active)) { \
>> +		val = __raw_i915_read##x(dev_priv, reg); \
>> +	} else { \
>> +		unsigned i; \
>> +		u32 *ptr_data = (u32 *) &val; \
>> +		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr_data++) \
>> +			*ptr_data = __gen9_decoupled_mmio_read(dev_priv, \
>> +						     offset, \
>> +						     fw_engine); \
>> +	} \
>> +	GEN6_READ_FOOTER; \
>> +}
>
> Reverse it,
>
> 	if (domain & ~dev_priv->uncore.fw_domains_active) {
> 		u32 *ptr = (u32 *)&val;
> 		unsigned i;
>
> 		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr++)
> 			*ptr = __gen9_decoupled_mmio_read32(dev_priv, offset, domain);
> 	} else {
> 		val = __raw_i915_read##x(dev_priv, reg);
> 	}
>
Tvrtko Ursulin Oct. 5, 2016, 1:50 p.m. UTC | #5
On 04/10/2016 16:46, Praveen Paneri wrote:
> Decoupled MMIO is an alternative way to access forcewake domain
> registers, which requires less cycles for a single read/write and
> avoids frequent software forcewake.
> This certainly gives advantage over the forcewake as this new
> mechanism “decouples” CPU cycles and allow them to complete even
> when GT is in a CPD (frequency change) or C6 state.
>
> This can co-exist with forcewake and we will continue to use forcewake
> as appropriate. E.g. 64-bit register writes to avoid writing 2 dwords
> separately and land into funny situations.
>
> v2:
> - Moved platform check out of the function and got rid of duplicate
>   functions to find out decoupled power domain (Chris)
> - Added a check for forcewake already held and skipped decoupled
>   access (Chris)
> - Skipped writing 64 bit registers through decoupled MMIO (Chris)
>
> v3:
> - Improved commit message with more info on decoupled mmio (Tvrtko)
> - Changed decoupled operation to enum and used u32 instead of
>   uint_32 data type for register offset (Tvrtko)
> - Moved HAS_DECOUPLED_MMIO to device info (Tvrtko)
> - Added lookup table for converting fw_engine to pd_engine (Tvrtko)
> - Improved __gen9_decoupled_read and __gen9_decoupled_write routines (Tvrtko)
>
> Signed-off-by: Zhe Wang <zhe1.wang@intel.com>
> Signed-off-by: Praveen Paneri <praveen.paneri@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h     |  18 +++++-
>   drivers/gpu/drm/i915/i915_pci.c     |   1 +
>   drivers/gpu/drm/i915/i915_reg.h     |   7 +++
>   drivers/gpu/drm/i915/intel_uncore.c | 113 ++++++++++++++++++++++++++++++++++++
>   4 files changed, 138 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f8c66ee..bfdd55a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -559,6 +559,18 @@ enum forcewake_domains {
>   #define FW_REG_READ  (1)
>   #define FW_REG_WRITE (2)
>   
> +enum decoupled_power_domains {
> +	GEN9_DECOUPLED_PD_BLITTER = 0,
> +	GEN9_DECOUPLED_PD_RENDER,
> +	GEN9_DECOUPLED_PD_MEDIA,
> +	GEN9_DECOUPLED_PD_ALL
> +};
> +
> +enum decoupled_ops {
> +	GEN9_DECOUPLED_OP_WRITE = 0,
> +	GEN9_DECOUPLED_OP_READ
> +};
> +
>   enum forcewake_domains
>   intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>   			       i915_reg_t reg, unsigned int op);
> @@ -690,7 +702,8 @@ struct intel_csr {
>   	func(has_snoop) sep \
>   	func(has_ddi) sep \
>   	func(has_fpga_dbg) sep \
> -	func(has_pooled_eu)
> +	func(has_pooled_eu) sep \
> +	func(has_decoupled_mmio)
>   
>   #define DEFINE_FLAG(name) u8 name:1
>   #define SEP_SEMICOLON ;
> @@ -2869,6 +2882,9 @@ struct drm_i915_cmd_table {
>   #define GT_FREQUENCY_MULTIPLIER 50
>   #define GEN9_FREQ_SCALER 3
>   
> +#define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)->has_decoupled_mmio \
> +		&& IS_BXT_REVID(dev, BXT_REVID_C0, REVID_FOREVER))
> +
>   #include "i915_trace.h"
>   
>   static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv)
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index 31e6edd..5c56c0c 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -360,6 +360,7 @@ static const struct intel_device_info intel_broxton_info = {
>   	.has_hw_contexts = 1,
>   	.has_logical_ring_contexts = 1,
>   	.has_guc = 1,
> +	.has_decoupled_mmio = 1,
>   	.ddb_size = 512,
>   	GEN_DEFAULT_PIPEOFFSETS,
>   	IVB_CURSOR_OFFSETS,
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 8d44cee..bf7b4c9 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -7398,6 +7398,13 @@ enum {
>   #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)
>   #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)
>   
> +/* Decoupled MMIO register pair for kernel driver */
> +#define GEN9_DECOUPLED_REG0_DW0			_MMIO(0xF00)
> +#define GEN9_DECOUPLED_REG0_DW1			_MMIO(0xF04)
> +#define GEN9_DECOUPLED_DW1_GO			(1<<31)
> +#define GEN9_DECOUPLED_PD_SHIFT			28
> +#define GEN9_DECOUPLED_OP_SHIFT			24
> +
>   /* Per-pipe DDI Function Control */
>   #define _TRANS_DDI_FUNC_CTL_A		0x60400
>   #define _TRANS_DDI_FUNC_CTL_B		0x61400
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index e2b188d..0af602e 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -831,6 +831,72 @@ unclaimed_reg_debug(struct drm_i915_private *dev_priv,
>   	__unclaimed_reg_debug(dev_priv, reg, read, before);
>   }
>   
> +static const enum decoupled_power_domains fw2dpd_engine[] = {
> +	GEN9_DECOUPLED_PD_RENDER,
> +	GEN9_DECOUPLED_PD_BLITTER,
> +	GEN9_DECOUPLED_PD_ALL,
> +	GEN9_DECOUPLED_PD_MEDIA,
> +	GEN9_DECOUPLED_PD_ALL,
> +	GEN9_DECOUPLED_PD_ALL,
> +	GEN9_DECOUPLED_PD_ALL
> +};
> +
> +/*
> + * Decoupled MMIO access for only 1 DWORD
> + */
> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
> +					 u32 reg,
> +					 enum forcewake_domains fw_engine,
> +					 enum decoupled_ops operation)
> +{
> +	enum decoupled_power_domains dpd_engine;
> +	u32 ctrl_reg_data = 0;
> +
> +	dpd_engine = fw2dpd_engine[fw_engine - 1];
> +
> +	ctrl_reg_data |= reg;
> +	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
> +	ctrl_reg_data |= (dpd_engine << GEN9_DECOUPLED_PD_SHIFT);
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
> +	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
> +
> +	if (wait_for_atomic((__raw_i915_read32(dev_priv,
> +			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
> +			FORCEWAKE_ACK_TIMEOUT_MS))
> +

I asked about the timeout before. Is the forcewake ack timeout
applicable to decoupled mmio, or should a better value be used?

Also, do you have any numbers on how fast decoupled access typically is?
In other words, how does it compare with the existing code for accesses not
done under an explicit forcewake get? Is a sequence of I915_READs, for
example, faster with decoupled mmio than under the current scheme of
automatic forcewake grab/release?

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f8c66ee..bfdd55a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -559,6 +559,18 @@  enum forcewake_domains {
 #define FW_REG_READ  (1)
 #define FW_REG_WRITE (2)
 
+enum decoupled_power_domains {
+	GEN9_DECOUPLED_PD_BLITTER = 0,
+	GEN9_DECOUPLED_PD_RENDER,
+	GEN9_DECOUPLED_PD_MEDIA,
+	GEN9_DECOUPLED_PD_ALL
+};
+
+enum decoupled_ops {
+	GEN9_DECOUPLED_OP_WRITE = 0,
+	GEN9_DECOUPLED_OP_READ
+};
+
 enum forcewake_domains
 intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
 			       i915_reg_t reg, unsigned int op);
@@ -690,7 +702,8 @@  struct intel_csr {
 	func(has_snoop) sep \
 	func(has_ddi) sep \
 	func(has_fpga_dbg) sep \
-	func(has_pooled_eu)
+	func(has_pooled_eu) sep \
+	func(has_decoupled_mmio)
 
 #define DEFINE_FLAG(name) u8 name:1
 #define SEP_SEMICOLON ;
@@ -2869,6 +2882,9 @@  struct drm_i915_cmd_table {
 #define GT_FREQUENCY_MULTIPLIER 50
 #define GEN9_FREQ_SCALER 3
 
+#define HAS_DECOUPLED_MMIO(dev) (INTEL_INFO(dev)->has_decoupled_mmio \
+		&& IS_BXT_REVID(dev, BXT_REVID_C0, REVID_FOREVER))
+
 #include "i915_trace.h"
 
 static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index 31e6edd..5c56c0c 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -360,6 +360,7 @@  static const struct intel_device_info intel_broxton_info = {
 	.has_hw_contexts = 1,
 	.has_logical_ring_contexts = 1,
 	.has_guc = 1,
+	.has_decoupled_mmio = 1,
 	.ddb_size = 512,
 	GEN_DEFAULT_PIPEOFFSETS,
 	IVB_CURSOR_OFFSETS,
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 8d44cee..bf7b4c9 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7398,6 +7398,13 @@  enum {
 #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)
 #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)
 
+/* Decoupled MMIO register pair for kernel driver */
+#define GEN9_DECOUPLED_REG0_DW0			_MMIO(0xF00)
+#define GEN9_DECOUPLED_REG0_DW1			_MMIO(0xF04)
+#define GEN9_DECOUPLED_DW1_GO			(1<<31)
+#define GEN9_DECOUPLED_PD_SHIFT			28
+#define GEN9_DECOUPLED_OP_SHIFT			24
+
 /* Per-pipe DDI Function Control */
 #define _TRANS_DDI_FUNC_CTL_A		0x60400
 #define _TRANS_DDI_FUNC_CTL_B		0x61400
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index e2b188d..0af602e 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -831,6 +831,72 @@  unclaimed_reg_debug(struct drm_i915_private *dev_priv,
 	__unclaimed_reg_debug(dev_priv, reg, read, before);
 }
 
+static const enum decoupled_power_domains fw2dpd_engine[] = {
+	GEN9_DECOUPLED_PD_RENDER,
+	GEN9_DECOUPLED_PD_BLITTER,
+	GEN9_DECOUPLED_PD_ALL,
+	GEN9_DECOUPLED_PD_MEDIA,
+	GEN9_DECOUPLED_PD_ALL,
+	GEN9_DECOUPLED_PD_ALL,
+	GEN9_DECOUPLED_PD_ALL
+};
+
+/*
+ * Decoupled MMIO access for only 1 DWORD
+ */
+static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
+					 u32 reg,
+					 enum forcewake_domains fw_engine,
+					 enum decoupled_ops operation)
+{
+	enum decoupled_power_domains dpd_engine;
+	u32 ctrl_reg_data = 0;
+
+	dpd_engine = fw2dpd_engine[fw_engine - 1];
+
+	ctrl_reg_data |= reg;
+	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
+	ctrl_reg_data |= (dpd_engine << GEN9_DECOUPLED_PD_SHIFT);
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	if (wait_for_atomic((__raw_i915_read32(dev_priv,
+			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
+			FORCEWAKE_ACK_TIMEOUT_MS))
+		DRM_ERROR("Decoupled MMIO wait timed out\n");
+}
+
+static inline u32 __gen9_decoupled_mmio_read(struct drm_i915_private *dev_priv,
+                                      u32 reg,
+                                      enum forcewake_domains fw_engine)
+{
+	__gen9_decoupled_mmio_access(dev_priv,
+			reg,
+			fw_engine,
+			GEN9_DECOUPLED_OP_READ);
+
+	return __raw_i915_read32(dev_priv,
+			GEN9_DECOUPLED_REG0_DW0);
+}
+
+static inline void __gen9_decoupled_mmio_write(struct drm_i915_private *dev_priv,
+                                      u32 reg, u32 data,
+                                      enum forcewake_domains fw_engine)
+{
+
+	__raw_i915_write32(dev_priv,
+			GEN9_DECOUPLED_REG0_DW0,
+			data);
+
+	__gen9_decoupled_mmio_access(dev_priv,
+			reg,
+			fw_engine,
+			GEN9_DECOUPLED_OP_WRITE);
+}
+
+
 #define GEN2_READ_HEADER(x) \
 	u##x val = 0; \
 	assert_rpm_wakelock_held(dev_priv);
@@ -935,6 +1001,27 @@  fwtable_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) {
 	GEN6_READ_FOOTER; \
 }
 
+#define __gen9_decoupled_read(x) \
+static u##x \
+gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
+	enum forcewake_domains fw_engine; \
+	GEN6_READ_HEADER(x); \
+	fw_engine = __fwtable_reg_read_fw_domains(offset); \
+	if (!fw_engine || !(fw_engine & ~dev_priv->uncore.fw_domains_active)) { \
+		val = __raw_i915_read##x(dev_priv, reg); \
+	} else { \
+		unsigned i; \
+		u32 *ptr_data = (u32 *) &val; \
+		for (i = 0; i < x/32; i++, offset += sizeof(u32), ptr_data++) \
+			*ptr_data = __gen9_decoupled_mmio_read(dev_priv, \
+						     offset, \
+						     fw_engine); \
+	} \
+	GEN6_READ_FOOTER; \
+}
+
+__gen9_decoupled_read(32)
+__gen9_decoupled_read(64)
 __fwtable_read(8)
 __fwtable_read(16)
 __fwtable_read(32)
@@ -1064,6 +1151,24 @@  fwtable_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, bo
 	GEN6_WRITE_FOOTER; \
 }
 
+#define __gen9_decoupled_write(x) \
+static void \
+gen9_decoupled_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
+		bool trace) { \
+	enum forcewake_domains fw_engine; \
+	GEN6_WRITE_HEADER; \
+	fw_engine = __fwtable_reg_write_fw_domains(offset); \
+	if (!fw_engine || !(fw_engine & ~dev_priv->uncore.fw_domains_active)) \
+		__raw_i915_write##x(dev_priv, reg, val); \
+	else \
+		__gen9_decoupled_mmio_write(dev_priv, \
+					     offset, \
+					     val, \
+					     fw_engine); \
+	GEN6_WRITE_FOOTER; \
+}
+
+__gen9_decoupled_write(32)
 __fwtable_write(8)
 __fwtable_write(16)
 __fwtable_write(32)
@@ -1287,6 +1392,14 @@  void intel_uncore_init(struct drm_i915_private *dev_priv)
 		ASSIGN_FW_DOMAINS_TABLE(__gen9_fw_ranges);
 		ASSIGN_WRITE_MMIO_VFUNCS(fwtable);
 		ASSIGN_READ_MMIO_VFUNCS(fwtable);
+		if (HAS_DECOUPLED_MMIO(dev_priv)) {
+			dev_priv->uncore.funcs.mmio_readl =
+						gen9_decoupled_read32;
+			dev_priv->uncore.funcs.mmio_readq =
+						gen9_decoupled_read64;
+			dev_priv->uncore.funcs.mmio_writel =
+						gen9_decoupled_write32;
+		}
 		break;
 	case 8:
 		if (IS_CHERRYVIEW(dev_priv)) {