Message ID | 1473139454-30627-1-git-send-email-praveen.paneri@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Sep 06, 2016 at 10:54:14AM +0530, Praveen Paneri wrote: > Decoupled MMIO is an alternative way to access forcewake domain > registers, which requires less cycles and avoids frequent software > forcewake. How about when forcewake is already held? You'll note that we still require irq-spinlocks so the mmio access is still not great. And we still will have to frequently take forcewake manually, apparently. Do you have any statistics to say that we do reduce grabing the fw wakelock and that the busywait you add instead is negligible. You are still using a 50ms timeout, so there is some doubt about "less cycles". > +/* > + * Decoupled MMIO access for only 1 DWORD > + */ > +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv, > + uint32_t reg, u32 *ptr_data, > + enum power_domains pd, int operation) > +{ > + u32 ctrl_reg_data = 0; > + > + if (operation == GEN9_DECOUPLED_OP_WRITE) > + __raw_i915_write32(dev_priv, > + GEN9_DECOUPLED_REG0_DW0, > + *ptr_data); > + > + ctrl_reg_data |= reg; > + ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT); > + ctrl_reg_data |= (pd << GEN9_DECOUPLED_PD_SHIFT); > + __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data); > + > + ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO; > + __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data); > + > + if (wait_for_atomic((__raw_i915_read32(dev_priv, > + GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0, > + FORCEWAKE_ACK_TIMEOUT_MS)) > + DRM_ERROR("Decoupled MMIO wait timed out\n"); > + > + if (operation == GEN9_DECOUPLED_OP_READ) > + *ptr_data = __raw_i915_read32(dev_priv, > + GEN9_DECOUPLED_REG0_DW0); > +} > + > #define GEN2_READ_HEADER(x) \ > u##x val = 0; \ > assert_rpm_wakelock_held(dev_priv); > @@ -932,12 +997,27 @@ chv_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \ > static u##x \ > gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \ > enum forcewake_domains fw_engine; \ > + 
enum power_domains pd_engine; \ > GEN6_READ_HEADER(x); \ > - fw_engine = __gen9_reg_read_fw_domains(offset); \ > - if (fw_engine) \ > - __force_wake_auto(dev_priv, fw_engine); \ > - val = __raw_i915_read##x(dev_priv, reg); \ > - GEN6_READ_FOOTER; \ > + pd_engine = __gen9_reg_read_power_domains(offset); \ > + if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \ Move the platform test out of here (since it is already a per-platform vfunc) and then skip the duplicated gen9 functions. > + u32 *ptr_data = (u32 *) &val; \ > + unsigned i = 0; \ > + for (i = 0; i < x/32; i++) { \ And tidy up the reassignments. > + __gen9_decoupled_mmio_access(dev_priv, \ > + (offset + i*4), \ > + ptr_data + i, \ > + pd_engine, \ > + GEN9_DECOUPLED_OP_READ); \ > + ptr_data++; \ > + } \ > + } else { \ > + fw_engine = __gen9_reg_read_fw_domains(offset); \ > + if (fw_engine) \ > + __force_wake_auto(dev_priv, fw_engine); \ > + val = __raw_i915_read##x(dev_priv, reg); \ > + } \ > + GEN6_READ_FOOTER; \ Misleading indentation. > } > > __gen9_read(8) > @@ -1101,11 +1181,26 @@ static void \ > gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \ > bool trace) { \ > enum forcewake_domains fw_engine; \ > + enum power_domains pd_engine; \ > GEN6_WRITE_HEADER; \ > - fw_engine = __gen9_reg_write_fw_domains(offset); \ > - if (fw_engine) \ > - __force_wake_auto(dev_priv, fw_engine); \ > - __raw_i915_write##x(dev_priv, reg, val); \ > + pd_engine = __gen9_reg_write_power_domains(offset); \ > + if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \ > + u32 *ptr_data = (u32 *) &val; \ > + unsigned i = 0; \ > + for (i = 0; i < x/32; i++) { \ > + __gen9_decoupled_mmio_access(dev_priv, \ > + (offset + i*4), \ > + ptr_data + i, \ > + pd_engine, \ > + GEN9_DECOUPLED_OP_WRITE); \ > + ptr_data++; \ > + } \ This is scary for a 64bit write. 
They are assumed to be an atomic transaction with hw - when they are not we encounter fun races where the hardware operates on the intermediate state. Hence we avoid them. -Chris
On Tuesday 06 September 2016 12:06 PM, Chris Wilson wrote: > On Tue, Sep 06, 2016 at 10:54:14AM +0530, Praveen Paneri wrote: >> Decoupled MMIO is an alternative way to access forcewake domain >> registers, which requires less cycles and avoids frequent software >> forcewake. > > How about when forcewake is already held? You'll note that we still I will try to add the same check (for domain->wake_count) in the decoupled MMIO path as well, and do a direct register access if forcewake is already held. > require irq-spinlocks so the mmio access is still not great. And we > still will have to frequently take forcewake manually, apparently. > > Do you have any statistics to say that we do reduce grabing the fw > wakelock and that the busywait you add instead is negligible. You are > still using a 50ms timeout, so there is some doubt about "less cycles". Sorry, I didn't find any such statistics with the Windows folks, but I can do an exercise myself to measure the actual benefit of decoupled MMIO. Could you please suggest a method to do that? The feature definitely helps the HW with synchronization, as the cycles are internally serialized in the GT, and it eliminates the risk of hitting certain hangs which exist in theory. 
> >> +/* >> + * Decoupled MMIO access for only 1 DWORD >> + */ >> +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv, >> + uint32_t reg, u32 *ptr_data, >> + enum power_domains pd, int operation) >> +{ >> + u32 ctrl_reg_data = 0; >> + >> + if (operation == GEN9_DECOUPLED_OP_WRITE) >> + __raw_i915_write32(dev_priv, >> + GEN9_DECOUPLED_REG0_DW0, >> + *ptr_data); >> + >> + ctrl_reg_data |= reg; >> + ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT); >> + ctrl_reg_data |= (pd << GEN9_DECOUPLED_PD_SHIFT); >> + __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data); >> + >> + ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO; >> + __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data); >> + >> + if (wait_for_atomic((__raw_i915_read32(dev_priv, >> + GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0, >> + FORCEWAKE_ACK_TIMEOUT_MS)) >> + DRM_ERROR("Decoupled MMIO wait timed out\n"); >> + >> + if (operation == GEN9_DECOUPLED_OP_READ) >> + *ptr_data = __raw_i915_read32(dev_priv, >> + GEN9_DECOUPLED_REG0_DW0); >> +} >> + >> #define GEN2_READ_HEADER(x) \ >> u##x val = 0; \ >> assert_rpm_wakelock_held(dev_priv); >> @@ -932,12 +997,27 @@ chv_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \ >> static u##x \ >> gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \ >> enum forcewake_domains fw_engine; \ >> + enum power_domains pd_engine; \ >> GEN6_READ_HEADER(x); \ >> - fw_engine = __gen9_reg_read_fw_domains(offset); \ >> - if (fw_engine) \ >> - __force_wake_auto(dev_priv, fw_engine); \ >> - val = __raw_i915_read##x(dev_priv, reg); \ >> - GEN6_READ_FOOTER; \ >> + pd_engine = __gen9_reg_read_power_domains(offset); \ >> + if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \ > > Move the platform test out of here (since it is already a per-platform > vfunc) and then skip the duplicated gen9 functions. 
> >> + u32 *ptr_data = (u32 *) &val; \ >> + unsigned i = 0; \ >> + for (i = 0; i < x/32; i++) { \ > > And tidy up the reassignments. > >> + __gen9_decoupled_mmio_access(dev_priv, \ >> + (offset + i*4), \ >> + ptr_data + i, \ >> + pd_engine, \ >> + GEN9_DECOUPLED_OP_READ); \ >> + ptr_data++; \ >> + } \ >> + } else { \ >> + fw_engine = __gen9_reg_read_fw_domains(offset); \ >> + if (fw_engine) \ >> + __force_wake_auto(dev_priv, fw_engine); \ >> + val = __raw_i915_read##x(dev_priv, reg); \ >> + } \ >> + GEN6_READ_FOOTER; \ > > Misleading indentation. > >> } >> >> __gen9_read(8) >> @@ -1101,11 +1181,26 @@ static void \ >> gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \ >> bool trace) { \ >> enum forcewake_domains fw_engine; \ >> + enum power_domains pd_engine; \ >> GEN6_WRITE_HEADER; \ >> - fw_engine = __gen9_reg_write_fw_domains(offset); \ >> - if (fw_engine) \ >> - __force_wake_auto(dev_priv, fw_engine); \ >> - __raw_i915_write##x(dev_priv, reg, val); \ >> + pd_engine = __gen9_reg_write_power_domains(offset); \ >> + if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \ >> + u32 *ptr_data = (u32 *) &val; \ >> + unsigned i = 0; \ >> + for (i = 0; i < x/32; i++) { \ >> + __gen9_decoupled_mmio_access(dev_priv, \ >> + (offset + i*4), \ >> + ptr_data + i, \ >> + pd_engine, \ >> + GEN9_DECOUPLED_OP_WRITE); \ >> + ptr_data++; \ >> + } \ > > This is scary for a 64bit write. They are assumed to be an atomic > transaction with hw - when they are not we encounter fun races where the > hardware operates on the intermediate state. Hence we avoid them. Decoupled MMIO currently doesn't support single 64 bit write. We can continue to use existing method for 64 bit writes. Thanks, Praveen > -Chisr >
On 15/11/2016 18:15, Patchwork wrote: > == Series Details == > > Series: drm/i915/bxt: Broxton decoupled MMIO (rev5) > URL : https://patchwork.freedesktop.org/series/12028/ > State : success > > == Summary == > > Series 12028v5 drm/i915/bxt: Broxton decoupled MMIO > https://patchwork.freedesktop.org/api/1.0/series/12028/revisions/5/mbox/ > > > fi-bdw-5557u total:244 pass:229 dwarn:0 dfail:0 fail:0 skip:15 > fi-bsw-n3050 total:244 pass:204 dwarn:0 dfail:0 fail:0 skip:40 > fi-bxt-t5700 total:244 pass:216 dwarn:0 dfail:0 fail:0 skip:28 > fi-byt-j1900 total:244 pass:216 dwarn:0 dfail:0 fail:0 skip:28 > fi-byt-n2820 total:244 pass:212 dwarn:0 dfail:0 fail:0 skip:32 > fi-hsw-4770 total:244 pass:224 dwarn:0 dfail:0 fail:0 skip:20 > fi-hsw-4770r total:244 pass:224 dwarn:0 dfail:0 fail:0 skip:20 > fi-ilk-650 total:244 pass:191 dwarn:0 dfail:0 fail:0 skip:53 > fi-ivb-3520m total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 > fi-ivb-3770 total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 > fi-kbl-7200u total:244 pass:222 dwarn:0 dfail:0 fail:0 skip:22 > fi-skl-6260u total:244 pass:230 dwarn:0 dfail:0 fail:0 skip:14 > fi-skl-6700hq total:244 pass:223 dwarn:0 dfail:0 fail:0 skip:21 > fi-skl-6700k total:244 pass:222 dwarn:1 dfail:0 fail:0 skip:21 > fi-skl-6770hq total:244 pass:230 dwarn:0 dfail:0 fail:0 skip:14 > fi-snb-2520m total:244 pass:212 dwarn:0 dfail:0 fail:0 skip:32 > fi-snb-2600 total:244 pass:211 dwarn:0 dfail:0 fail:0 skip:33 > > 04145fe15cf8c81c221e62fc9d65d93053f9bd1a drm-intel-nightly: 2016y-11m-15d-14h-47m-07s UTC integration manifest > 4acf91b drm/i915/bxt: Broxton decoupled MMIO Merged to dinq - thanks for the patch! Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c413587..1ecda04 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -558,6 +558,16 @@ enum forcewake_domains { #define FW_REG_READ (1) #define FW_REG_WRITE (2) +enum power_domains { + GEN9_DECOUPLED_PD_BLITTER = 0, + GEN9_DECOUPLED_PD_RENDER, + GEN9_DECOUPLED_PD_MEDIA, + GEN9_DECOUPLED_PD_ALL +}; + +#define GEN9_DECOUPLED_OP_WRITE 0 +#define GEN9_DECOUPLED_OP_READ 1 + enum forcewake_domains intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv, i915_reg_t reg, unsigned int op); @@ -2842,6 +2852,7 @@ struct drm_i915_cmd_table { #define GT_FREQUENCY_MULTIPLIER 50 #define GEN9_FREQ_SCALER 3 +#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv) && IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER)) #include "i915_trace.h" static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index a29d707..1a7acdf 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7394,6 +7394,13 @@ enum { #define SKL_FUSE_PG1_DIST_STATUS (1<<26) #define SKL_FUSE_PG2_DIST_STATUS (1<<25) +/* Decoupled MMIO register pair for kernel driver */ +#define GEN9_DECOUPLED_REG0_DW0 _MMIO(0xF00) +#define GEN9_DECOUPLED_REG0_DW1 _MMIO(0xF04) +#define GEN9_DECOUPLED_DW1_GO (1<<31) +#define GEN9_DECOUPLED_PD_SHIFT 28 +#define GEN9_DECOUPLED_OP_SHIFT 24 + /* Per-pipe DDI Function Control */ #define _TRANS_DDI_FUNC_CTL_A 0x60400 #define _TRANS_DDI_FUNC_CTL_B 0x61400 diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index e9f68cd..d19ee2f 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -745,6 +745,22 @@ static bool is_gen8_shadowed(u32 offset) __fwd; \ }) +#define __gen9_reg_read_power_domains(offset) \ +({ \ + enum power_domains __dpd; \ + if 
(!SKL_NEEDS_FORCE_WAKE(offset)) \ + __dpd = 0; \ + else if (FORCEWAKE_GEN9_RENDER_RANGE_OFFSET(offset)) \ + __dpd = GEN9_DECOUPLED_PD_RENDER; \ + else if (FORCEWAKE_GEN9_MEDIA_RANGE_OFFSET(offset)) \ + __dpd = GEN9_DECOUPLED_PD_MEDIA; \ + else if (FORCEWAKE_GEN9_COMMON_RANGE_OFFSET(offset)) \ + __dpd = GEN9_DECOUPLED_PD_ALL; \ + else \ + __dpd = GEN9_DECOUPLED_PD_BLITTER; \ + __dpd; \ +}) + static const i915_reg_t gen9_shadowed_regs[] = { RING_TAIL(RENDER_RING_BASE), RING_TAIL(GEN6_BSD_RING_BASE), @@ -781,6 +797,23 @@ static bool is_gen9_shadowed(u32 offset) __fwd; \ }) +#define __gen9_reg_write_power_domains(offset) \ +({ \ + enum power_domains __dpd; \ + if (!SKL_NEEDS_FORCE_WAKE(offset) || is_gen9_shadowed(offset)) \ + __dpd = 0; \ + else if (FORCEWAKE_GEN9_RENDER_RANGE_OFFSET(offset)) \ + __dpd = GEN9_DECOUPLED_PD_RENDER; \ + else if (FORCEWAKE_GEN9_MEDIA_RANGE_OFFSET(offset)) \ + __dpd = GEN9_DECOUPLED_PD_MEDIA; \ + else if (FORCEWAKE_GEN9_COMMON_RANGE_OFFSET(offset)) \ + __dpd = GEN9_DECOUPLED_PD_ALL; \ + else \ + __dpd = GEN9_DECOUPLED_PD_BLITTER; \ + __dpd; \ +}) + + static void ilk_dummy_write(struct drm_i915_private *dev_priv) { @@ -816,6 +849,38 @@ unclaimed_reg_debug(struct drm_i915_private *dev_priv, __unclaimed_reg_debug(dev_priv, reg, read, before); } +/* + * Decoupled MMIO access for only 1 DWORD + */ +static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv, + uint32_t reg, u32 *ptr_data, + enum power_domains pd, int operation) +{ + u32 ctrl_reg_data = 0; + + if (operation == GEN9_DECOUPLED_OP_WRITE) + __raw_i915_write32(dev_priv, + GEN9_DECOUPLED_REG0_DW0, + *ptr_data); + + ctrl_reg_data |= reg; + ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT); + ctrl_reg_data |= (pd << GEN9_DECOUPLED_PD_SHIFT); + __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data); + + ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO; + __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data); + + if 
(wait_for_atomic((__raw_i915_read32(dev_priv, + GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0, + FORCEWAKE_ACK_TIMEOUT_MS)) + DRM_ERROR("Decoupled MMIO wait timed out\n"); + + if (operation == GEN9_DECOUPLED_OP_READ) + *ptr_data = __raw_i915_read32(dev_priv, + GEN9_DECOUPLED_REG0_DW0); +} + #define GEN2_READ_HEADER(x) \ u##x val = 0; \ assert_rpm_wakelock_held(dev_priv); @@ -932,12 +997,27 @@ chv_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \ static u##x \ gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \ enum forcewake_domains fw_engine; \ + enum power_domains pd_engine; \ GEN6_READ_HEADER(x); \ - fw_engine = __gen9_reg_read_fw_domains(offset); \ - if (fw_engine) \ - __force_wake_auto(dev_priv, fw_engine); \ - val = __raw_i915_read##x(dev_priv, reg); \ - GEN6_READ_FOOTER; \ + pd_engine = __gen9_reg_read_power_domains(offset); \ + if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \ + u32 *ptr_data = (u32 *) &val; \ + unsigned i = 0; \ + for (i = 0; i < x/32; i++) { \ + __gen9_decoupled_mmio_access(dev_priv, \ + (offset + i*4), \ + ptr_data + i, \ + pd_engine, \ + GEN9_DECOUPLED_OP_READ); \ + ptr_data++; \ + } \ + } else { \ + fw_engine = __gen9_reg_read_fw_domains(offset); \ + if (fw_engine) \ + __force_wake_auto(dev_priv, fw_engine); \ + val = __raw_i915_read##x(dev_priv, reg); \ + } \ + GEN6_READ_FOOTER; \ } __gen9_read(8) @@ -1101,11 +1181,26 @@ static void \ gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \ bool trace) { \ enum forcewake_domains fw_engine; \ + enum power_domains pd_engine; \ GEN6_WRITE_HEADER; \ - fw_engine = __gen9_reg_write_fw_domains(offset); \ - if (fw_engine) \ - __force_wake_auto(dev_priv, fw_engine); \ - __raw_i915_write##x(dev_priv, reg, val); \ + pd_engine = __gen9_reg_write_power_domains(offset); \ + if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \ + u32 *ptr_data = (u32 *) &val; \ + unsigned i = 0; \ + for (i = 0; 
i < x/32; i++) { \ + __gen9_decoupled_mmio_access(dev_priv, \ + (offset + i*4), \ + ptr_data + i, \ + pd_engine, \ + GEN9_DECOUPLED_OP_WRITE); \ + ptr_data++; \ + } \ + } else { \ + fw_engine = __gen9_reg_write_fw_domains(offset); \ + if (fw_engine) \ + __force_wake_auto(dev_priv, fw_engine); \ + __raw_i915_write##x(dev_priv, reg, val); \ + } \ GEN6_WRITE_FOOTER; \ }