diff mbox

[RFC,04/20] drm/i915: Transform context WAs into static tables

Message ID 1509732588-10599-5-git-send-email-oscar.mateo@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

oscar.mateo@intel.com Nov. 3, 2017, 6:09 p.m. UTC
This is for WAs that need to touch registers that get saved/restored
together with the logical context. The idea is that WAs are "pretty"
static, so a table is more declarative than a programmatic approach.
Note however that some amount of caching is needed for those things
that are dynamic (e.g. things that need some calculation, or have
a criteria different than the more obvious GEN + stepping).

Also, this makes very explicit which WAs live in the context.

Suggested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c      |  40 +-
 drivers/gpu/drm/i915/i915_drv.h          |  35 +-
 drivers/gpu/drm/i915/i915_gem_context.c  |   4 -
 drivers/gpu/drm/i915/intel_workarounds.c | 754 +++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_workarounds.h |   4 +-
 5 files changed, 466 insertions(+), 371 deletions(-)

Comments

Joonas Lahtinen Nov. 6, 2017, 11:59 a.m. UTC | #1
On Fri, 2017-11-03 at 11:09 -0700, Oscar Mateo wrote:
> This is for WAs that need to touch registers that get saved/restored
> together with the logical context. The idea is that WAs are "pretty"
> static, so a table is more declarative than a programmatic approach.
> Note however that some amount of caching is needed for those things
> that are dynamic (e.g. things that need some calculation, or have
> a criteria different than the more obvious GEN + stepping).
> 
> Also, this makes very explicit which WAs live in the context.
> 
> Suggested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

<SNIP>

> +struct i915_wa_reg;
> +
> +typedef bool (* wa_pre_hook_func)(struct drm_i915_private *dev_priv,
> +				  struct i915_wa_reg *wa);
> +typedef void (* wa_post_hook_func)(struct drm_i915_private *dev_priv,
> +				   struct i915_wa_reg *wa);

To avoid carrying any variables over, how about just apply() hook?
Also, you don't have to have "_hook" going there, it's tak

>  struct i915_wa_reg {
> +	const char *name;

We may want some Kconfig option for skipping these.

> +	enum wa_type {
> +		I915_WA_TYPE_CONTEXT = 0,
> +		I915_WA_TYPE_GT,
> +		I915_WA_TYPE_DISPLAY,
> +		I915_WA_TYPE_WHITELIST
> +	} type;
> +

Any specific reason not to have the gen here too? Then you can have one
big table, instead of tables of tables. Then the numeric code of a WA
(position in that table) would be equally identifying it compared to
the WA name (which is nice to have information, so config time opt-in).

> +	u8 since;
> +	u8 until;

Most seem to have ALL_REVS, so this could be after the coarse-grained
gen-check in the apply function.

> +
>  	i915_reg_t addr;
> -	u32 value;
> -	/* bitmask representing WA bits */
>  	u32 mask;
> +	u32 value;
> +	bool is_masked_reg;

I'd hide this detail into the apply function.

> +
> +	wa_pre_hook_func pre_hook;
> +	wa_post_hook_func post_hook;

	bool (*apply)(const struct i915_wa *wa,
		      struct drm_i915_private *dev_priv);

> +	u32 hook_data;
> +	bool applied;

The big point would be to make this into const, so "applied" would
defeat that.

<SNIP>

> +#define MASK(mask, value)	((mask) << 16 | (value))
> +#define MASK_ENABLE(x)		(MASK((x), (x)))
> +#define MASK_DISABLE(x)		(MASK((x), 0))
>  
> -#define WA_REG(addr, mask, val) do { \
> -		const int r = wa_add(dev_priv, (addr), (mask), (val)); \
> -		if (r) \
> -			return r; \
> -	} while (0)
> +#define SET_BIT_MASKED(m) 		\
> +	.mask = (m),			\
> +	.value = MASK_ENABLE(m),	\
> +	.is_masked_reg = true
>  
> -#define WA_SET_BIT_MASKED(addr, mask) \
> -	WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))
> +#define CLEAR_BIT_MASKED( m) 		\
> +	.mask = (m),			\
> +	.value = MASK_DISABLE(m),	\
> +	.is_masked_reg = true
>  
> -#define WA_CLR_BIT_MASKED(addr, mask) \
> -	WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask))
> +#define SET_FIELD_MASKED(m, v) 		\
> +	.mask = (m),			\
> +	.value = MASK(m, v),		\
> +	.is_masked_reg = true

Lets try to have the struct i915_wa as small as possible, so this could
be calculated in the apply function.

So, avoiding the macros this would indeed become rather declarative;

{
	WA_NAME("WaDisableAsyncFlipPerfMode")
	.gen = ...,
	.reg = MI_MODE,
	.value = ASYNC_FLIP_PERF_DISABLE,
	.apply = set_bit_masked,
},

Or, we could also have;

static const struct i915_wa WaDisableAsyncFlipPerfMode = {
	.gen = ...,
	.reg = MI_MODE,
	.value = ASYNC_FLIP_PERF_DISABLE,
	.apply = set_bit_masked,
};

And then one array of those.

	WA(WaDisableAsyncFlipPerfMode),

Then you could at compile time decide if you stringify and store the
name. But that'd be more const data than necessary (pointers to
structs, instead of an array of structs).

Regards, Joonas
oscar.mateo@intel.com Nov. 6, 2017, 6:54 p.m. UTC | #2
On 11/06/2017 03:59 AM, Joonas Lahtinen wrote:
> On Fri, 2017-11-03 at 11:09 -0700, Oscar Mateo wrote:
>> This is for WAs that need to touch registers that get saved/restored
>> together with the logical context. The idea is that WAs are "pretty"
>> static, so a table is more declarative than a programmatic approach.
>> Note however that some amount of caching is needed for those things
>> that are dynamic (e.g. things that need some calculation, or have
>> a criteria different than the more obvious GEN + stepping).
>>
>> Also, this makes very explicit which WAs live in the context.
>>
>> Suggested-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>> Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> <SNIP>
>
>> +struct i915_wa_reg;
>> +
>> +typedef bool (* wa_pre_hook_func)(struct drm_i915_private *dev_priv,
>> +				  struct i915_wa_reg *wa);
>> +typedef void (* wa_post_hook_func)(struct drm_i915_private *dev_priv,
>> +				   struct i915_wa_reg *wa);
> To avoid carrying any variables over, how about just apply() hook?
> Also, you don't have to have "_hook" going there, it's tak
>

Not all WAs are applied in the same way: ctx-style workarounds are 
emitted as LRI commands to the ring. Do you treat those differently?

>>   struct i915_wa_reg {
>> +	const char *name;
> We may want some Kconfig option for skipping these.

Sure. But we should try to decide first if we want to store this at all, 
like: what do we expect to use this for? is it worth it?

>> +	enum wa_type {
>> +		I915_WA_TYPE_CONTEXT = 0,
>> +		I915_WA_TYPE_GT,
>> +		I915_WA_TYPE_DISPLAY,
>> +		I915_WA_TYPE_WHITELIST
>> +	} type;
>> +
> Any specific reason not to have the gen here too? Then you can have one
> big table, instead of tables of tables. Then the numeric code of a WA
> (position in that table) would be equally identifying it compared to
> the WA name (which is nice to have information, so config time opt-in).

Such a "big table" would be quite big, indeed. And we know we want to 
apply the workarounds from at least four different places, so looping 
through the table each and every time to find the relevant WAs seems 
like a waste. Also, in some places we would have to loop more than once 
(to know the number of WAs to apply before we can reserve space in the 
ring for ctx-style WAs, for example).

I could also go for 4 slightly smaller tables (one per type of WA) but 
then there is another problem to solve: how do you record WAs that apply 
for all revisions of one GEN, but a smaller number of revisions of 
another? (e.g. WaDisableFenceDestinationToSLM applies to all BDW 
steppings but only KBL A0).

>> +	u8 since;
>> +	u8 until;
> Most seem to have ALL_REVS, so this could be after the coarse-grained
> gen-check in the apply function.

So every single WA that applies to specific REVS gets an "apply" 
function? That looks like a lot of functions (I count 25 WAs that only 
apply to some steppings already). Or are you simply saying here that I 
check the GEN before checking the stepping (which is the only order that 
makes sense anyway)?

>> +
>>   	i915_reg_t addr;
>> -	u32 value;
>> -	/* bitmask representing WA bits */
>>   	u32 mask;
>> +	u32 value;
>> +	bool is_masked_reg;
> I'd hide this detail into the apply function.

I see. But if you don't store the mask: what do you output in debugfs?

>
>> +
>> +	wa_pre_hook_func pre_hook;
>> +	wa_post_hook_func post_hook;
> 	bool (*apply)(const struct i915_wa *wa,
> 		      struct drm_i915_private *dev_priv);
>
>> +	u32 hook_data;
>> +	bool applied;
> The big point would be to make this into const, so "applied" would
> defeat that.

Yeah, I realized. Keeping a separate bitmask of which WAs have been 
applied is not a big deal, but then I became aware that there are many 
more things that would need to be cached. For example, some WAs require 
to compute the actual value you write into their register. What do you 
do with those? (remember that you still want to print the expected value 
in debugfs for these).

> <SNIP>
>
>> +#define MASK(mask, value)	((mask) << 16 | (value))
>> +#define MASK_ENABLE(x)		(MASK((x), (x)))
>> +#define MASK_DISABLE(x)		(MASK((x), 0))
>>   
>> -#define WA_REG(addr, mask, val) do { \
>> -		const int r = wa_add(dev_priv, (addr), (mask), (val)); \
>> -		if (r) \
>> -			return r; \
>> -	} while (0)
>> +#define SET_BIT_MASKED(m) 		\
>> +	.mask = (m),			\
>> +	.value = MASK_ENABLE(m),	\
>> +	.is_masked_reg = true
>>   
>> -#define WA_SET_BIT_MASKED(addr, mask) \
>> -	WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))
>> +#define CLEAR_BIT_MASKED( m) 		\
>> +	.mask = (m),			\
>> +	.value = MASK_DISABLE(m),	\
>> +	.is_masked_reg = true
>>   
>> -#define WA_CLR_BIT_MASKED(addr, mask) \
>> -	WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask))
>> +#define SET_FIELD_MASKED(m, v) 		\
>> +	.mask = (m),			\
>> +	.value = MASK(m, v),		\
>> +	.is_masked_reg = true
> Lets try to have the struct i915_wa as small as possible, so this could
> be calculated in the apply function.
>
> So, avoiding the macros this would indeed become rather declarative;
>
> {
> 	WA_NAME("WaDisableAsyncFlipPerfMode")
> 	.gen = ...,
> 	.reg = MI_MODE,
> 	.value = ASYNC_FLIP_PERF_DISABLE,
> 	.apply = set_bit_masked,
> },
> Or, we could also have;
>
> static const struct i915_wa WaDisableAsyncFlipPerfMode = {
> 	.gen = ...,
> 	.reg = MI_MODE,
> 	.value = ASYNC_FLIP_PERF_DISABLE,
> 	.apply = set_bit_masked,
> };
>
> And then one array of those.
>
> 	WA(WaDisableAsyncFlipPerfMode),

This is the list of problems we need to solve before we can go forward 
with this design:

- What to do with WAs that don't know a priori what .value should be, 
because it gets computed in places like skl_tune_iz_hashing or 
use_gtt_cache? (yes, computing in the apply function is the immediate 
answer, but then... how do you output that in debugfs?).
- What to do with context-style WAs, that are emitted instead of 
applied, as I mentioned above?.
- What to do with whitelist-style functions, where you need to access 
the .reg field of i915_reg_t to know the .value? Also, the .reg depends 
on the engine (although I guess you can always statically codify that in 
the table and apply the whitelist WAs later, once all the engines are up).
- You are not storing .since/.until. Does that mean every WA that 
applies to only some steppings gets a custom apply function?.
- If you don't store the computed mask anywhere, what do you output in 
debugfs? (which is the real improvement we want to achieve?).
- Something to be careful about: some WAs are named the same, but their 
reg/value is different (because the register has changed in one 
particular GEN or whatever). The solution could be a modifier to the 
name (WaSomething_bdw_chv and WaSomething_skl) but this could be a 
source of errors.

> Then you could at compile time decide if you stringify and store the
> name. But that'd be more const data than necessary (pointers to
> structs, instead of an array of structs).
>
> Regards, Joonas

One more thing: I still urge to reconsider merging what we already have, 
and doing these improvements (once we agree on a design) later on. The 
reason being that the sooner we get a list of all WAs in debugfs, the 
better (which can be used later on to verify any further improvements we 
do).

Thanks for the review,
Oscar
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 39883cd..12c4330 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -30,6 +30,7 @@ 
 #include <linux/sort.h>
 #include <linux/sched/mm.h>
 #include "intel_drv.h"
+#include "intel_workarounds.h"
 #include "i915_guc_submission.h"
 
 static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
@@ -3357,13 +3358,16 @@  static int i915_shared_dplls_info(struct seq_file *m, void *unused)
 
 static int i915_wa_registers(struct seq_file *m, void *unused)
 {
-	int i;
-	int ret;
 	struct intel_engine_cs *engine;
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	struct drm_device *dev = &dev_priv->drm;
 	struct i915_workarounds *workarounds = &dev_priv->workarounds;
+	const struct i915_wa_reg_table *wa_table;
+	uint table_count;
 	enum intel_engine_id id;
+	int i, j, ret;
+
+	intel_ctx_workarounds_get(dev_priv, &wa_table, &table_count);
 
 	ret = mutex_lock_interruptible(&dev->struct_mutex);
 	if (ret)
@@ -3371,22 +3375,28 @@  static int i915_wa_registers(struct seq_file *m, void *unused)
 
 	intel_runtime_pm_get(dev_priv);
 
-	seq_printf(m, "Workarounds applied: %d\n", workarounds->count);
+	seq_printf(m, "Workarounds applied: %d\n", workarounds->ctx_count);
 	for_each_engine(engine, dev_priv, id)
 		seq_printf(m, "HW whitelist count for %s: %d\n",
 			   engine->name, workarounds->hw_whitelist_count[id]);
-	for (i = 0; i < workarounds->count; ++i) {
-		i915_reg_t addr;
-		u32 mask, value, read;
-		bool ok;
-
-		addr = workarounds->reg[i].addr;
-		mask = workarounds->reg[i].mask;
-		value = workarounds->reg[i].value;
-		read = I915_READ(addr);
-		ok = (value & mask) == (read & mask);
-		seq_printf(m, "0x%X: 0x%08X, mask: 0x%08X, read: 0x%08x, status: %s\n",
-			   i915_mmio_reg_offset(addr), value, mask, read, ok ? "OK" : "FAIL");
+
+	for (i = 0; i < table_count; i++) {
+		const struct i915_wa_reg *wa = wa_table[i].table;
+
+		for (j = 0; j < wa_table[i].count; j++) {
+			u32 read;
+			bool ok;
+
+			if (!wa[j].applied)
+				continue;
+
+			read = I915_READ(wa[j].addr);
+			ok = (wa[j].value & wa[j].mask) == (read & wa[j].mask);
+			seq_printf(m,
+				   "0x%X: 0x%08X, mask: 0x%08X, read: 0x%08x, status: %s, name: %s\n",
+				   i915_mmio_reg_offset(wa[j].addr), wa[j].value,
+				   wa[j].mask, read, ok ? "OK" : "FAIL", wa[j].name);
+		}
 	}
 
 	intel_runtime_pm_put(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4a7325c..1c73fec 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1970,18 +1970,43 @@  struct i915_frontbuffer_tracking {
 	unsigned flip_bits;
 };
 
+struct i915_wa_reg;
+
+typedef bool (* wa_pre_hook_func)(struct drm_i915_private *dev_priv,
+				  struct i915_wa_reg *wa);
+typedef void (* wa_post_hook_func)(struct drm_i915_private *dev_priv,
+				   struct i915_wa_reg *wa);
+
 struct i915_wa_reg {
+	const char *name;
+	enum wa_type {
+		I915_WA_TYPE_CONTEXT = 0,
+		I915_WA_TYPE_GT,
+		I915_WA_TYPE_DISPLAY,
+		I915_WA_TYPE_WHITELIST
+	} type;
+
+	u8 since;
+	u8 until;
+
 	i915_reg_t addr;
-	u32 value;
-	/* bitmask representing WA bits */
 	u32 mask;
+	u32 value;
+	bool is_masked_reg;
+
+	wa_pre_hook_func pre_hook;
+	wa_post_hook_func post_hook;
+	u32 hook_data;
+	bool applied;
 };
 
-#define I915_MAX_WA_REGS 16
+struct i915_wa_reg_table {
+	struct i915_wa_reg *table;
+	int count;
+};
 
 struct i915_workarounds {
-	struct i915_wa_reg reg[I915_MAX_WA_REGS];
-	u32 count;
+	u32 ctx_count;
 	u32 hw_whitelist_count[I915_NUM_ENGINES];
 };
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 8548e571..7d04b5e 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -457,10 +457,6 @@  int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
 
 	GEM_BUG_ON(dev_priv->kernel_context);
 
-	err = intel_ctx_workarounds_init(dev_priv);
-	if (err)
-		goto err;
-
 	INIT_LIST_HEAD(&dev_priv->contexts.list);
 	INIT_WORK(&dev_priv->contexts.free_work, contexts_free_worker);
 	init_llist_head(&dev_priv->contexts.free_list);
diff --git a/drivers/gpu/drm/i915/intel_workarounds.c b/drivers/gpu/drm/i915/intel_workarounds.c
index 0a8f265..b00899e 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/intel_workarounds.c
@@ -25,61 +25,65 @@ 
 #include "i915_drv.h"
 #include "intel_workarounds.h"
 
-static int wa_add(struct drm_i915_private *dev_priv,
-		  i915_reg_t addr,
-		  const u32 mask, const u32 val)
-{
-	const u32 idx = dev_priv->workarounds.count;
+#define WA_CTX(wa)			\
+	.name = (wa),			\
+	.type = I915_WA_TYPE_CONTEXT
 
-	if (WARN_ON(idx >= I915_MAX_WA_REGS))
-		return -ENOSPC;
+#define ALL_REVS		\
+	.since = 0,		\
+	.until = REVID_FOREVER
 
-	dev_priv->workarounds.reg[idx].addr = addr;
-	dev_priv->workarounds.reg[idx].value = val;
-	dev_priv->workarounds.reg[idx].mask = mask;
+#define REVS(s, u)	\
+	.since = (s),	\
+	.until = (u)
 
-	dev_priv->workarounds.count++;
+#define REG(a)		\
+	.addr = (a)
 
-	return 0;
-}
+#define MASK(mask, value)	((mask) << 16 | (value))
+#define MASK_ENABLE(x)		(MASK((x), (x)))
+#define MASK_DISABLE(x)		(MASK((x), 0))
 
-#define WA_REG(addr, mask, val) do { \
-		const int r = wa_add(dev_priv, (addr), (mask), (val)); \
-		if (r) \
-			return r; \
-	} while (0)
+#define SET_BIT_MASKED(m) 		\
+	.mask = (m),			\
+	.value = MASK_ENABLE(m),	\
+	.is_masked_reg = true
 
-#define WA_SET_BIT_MASKED(addr, mask) \
-	WA_REG(addr, (mask), _MASKED_BIT_ENABLE(mask))
+#define CLEAR_BIT_MASKED( m) 		\
+	.mask = (m),			\
+	.value = MASK_DISABLE(m),	\
+	.is_masked_reg = true
 
-#define WA_CLR_BIT_MASKED(addr, mask) \
-	WA_REG(addr, (mask), _MASKED_BIT_DISABLE(mask))
+#define SET_FIELD_MASKED(m, v) 		\
+	.mask = (m),			\
+	.value = MASK(m, v),		\
+	.is_masked_reg = true
 
-#define WA_SET_FIELD_MASKED(addr, mask, value) \
-	WA_REG(addr, mask, _MASKED_FIELD(mask, value))
-
-static int gen8_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
+static struct i915_wa_reg gen8_ctx_was[] = {
+	{ WA_CTX(""),
+	  ALL_REVS, REG(INSTPM),
+	  SET_BIT_MASKED(INSTPM_FORCE_ORDERING) },
 
-	/* WaDisableAsyncFlipPerfMode:bdw,chv */
-	WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE);
+	{ WA_CTX("WaDisableAsyncFlipPerfMode"),
+	  ALL_REVS, REG(MI_MODE),
+	  SET_BIT_MASKED(ASYNC_FLIP_PERF_DISABLE) },
 
-	/* WaDisablePartialInstShootdown:bdw,chv */
-	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
-			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
+	{ WA_CTX("WaDisablePartialInstShootdown"),
+	  ALL_REVS, REG(GEN8_ROW_CHICKEN),
+	  SET_BIT_MASKED(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE) },
 
-	/* Use Force Non-Coherent whenever executing a 3D context. This is a
+	/*
+	 * Use Force Non-Coherent whenever executing a 3D context. This is a
 	 * workaround for for a possible hang in the unlikely event a TLB
 	 * invalidation occurs during a PSD flush.
 	 */
-	/* WaForceEnableNonCoherent:bdw,chv */
-	/* WaHdcDisableFetchWhenMasked:bdw,chv */
-	WA_SET_BIT_MASKED(HDC_CHICKEN0,
-			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
-			  HDC_FORCE_NON_COHERENT);
+	{ WA_CTX("WaForceEnableNonCoherent + WaHdcDisableFetchWhenMasked"),
+	  ALL_REVS, REG(HDC_CHICKEN0),
+	  SET_BIT_MASKED(HDC_FORCE_NON_COHERENT |
+			 HDC_DONOT_FETCH_MEM_WHEN_MASKED) },
 
-	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
+	/*
+	 * From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 	 *  polygons in the same 8x4 pixel/sample area to be processed without
 	 *  stalling waiting for the earlier ones to write to Hierarchical Z
@@ -87,10 +91,13 @@  static int gen8_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	 *
 	 * This optimization is off by default for BDW and CHV; turn it on.
 	 */
-	WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
+	{ WA_CTX(""),
+	  ALL_REVS, REG(CACHE_MODE_0_GEN7),
+	  CLEAR_BIT_MASKED(HIZ_RAW_STALL_OPT_DISABLE) },
 
-	/* Wa4x4STCOptimizationDisable:bdw,chv */
-	WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
+	{ WA_CTX("Wa4x4STCOptimizationDisable"),
+	  ALL_REVS, REG(CACHE_MODE_1),
+	  SET_BIT_MASKED(GEN8_4x4_STC_OPTIMIZATION_DISABLE) },
 
 	/*
 	 * BSpec recommends 8x4 when MSAA is used,
@@ -100,154 +107,126 @@  static int gen8_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	 * disable bit, which we don't touch here, but it's good
 	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
 	 */
-	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
-			    GEN6_WIZ_HASHING_MASK,
-			    GEN6_WIZ_HASHING_16x4);
-
-	return 0;
-}
+	{ WA_CTX(""),
+	  ALL_REVS, REG(GEN7_GT_MODE),
+	  SET_FIELD_MASKED(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4) },
+};
 
-static int bdw_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static bool is_bdw_gt3(struct drm_i915_private *dev_priv, struct i915_wa_reg *wa)
 {
-	int ret;
-
-	ret = gen8_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	return IS_BDW_GT3(dev_priv);
+}
 
-	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
-	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
+static struct i915_wa_reg bdw_ctx_was[] = {
+	{ WA_CTX("WaDisableThreadStallDopClockGating (pre-prod)"),
+	  ALL_REVS, REG(GEN8_ROW_CHICKEN),
+	  SET_BIT_MASKED(STALL_DOP_GATING_DISABLE) },
 
-	/* WaDisableDopClockGating:bdw
-	 *
+	/*
 	 * Also see the related UCGTCL1 write in broadwell_init_clock_gating()
 	 * to disable EUTC clock gating.
 	 */
-	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
-			  DOP_CLOCK_GATING_DISABLE);
-
-	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
-			  GEN8_SAMPLER_POWER_BYPASS_DIS);
+	{ WA_CTX("WaDisableDopClockGating"),
+	  ALL_REVS, REG(GEN7_ROW_CHICKEN2),
+	  SET_BIT_MASKED(DOP_CLOCK_GATING_DISABLE) },
 
-	WA_SET_BIT_MASKED(HDC_CHICKEN0,
-			  /* WaForceContextSaveRestoreNonCoherent:bdw */
-			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
-			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
-			  (IS_BDW_GT3(dev_priv) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
+	{ WA_CTX(""),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN8_SAMPLER_POWER_BYPASS_DIS) },
 
-	return 0;
-}
+	{ WA_CTX("WaForceContextSaveRestoreNonCoherent"),
+	  ALL_REVS, REG(HDC_CHICKEN0),
+	  SET_BIT_MASKED(HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT) },
 
-static int chv_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	int ret;
+	{ WA_CTX("WaDisableFenceDestinationToSLM (pre-prod)"),
+	  ALL_REVS, REG(HDC_CHICKEN0),
+	  SET_BIT_MASKED(HDC_FENCE_DEST_SLM_DISABLE),
+	  .pre_hook = is_bdw_gt3 },
+};
 
-	ret = gen8_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
-
-	/* WaDisableThreadStallDopClockGating:chv */
-	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
+static struct i915_wa_reg chv_ctx_was[] = {
+	{ WA_CTX("WaDisableThreadStallDopClockGating"),
+	  ALL_REVS, REG(GEN8_ROW_CHICKEN),
+	  SET_BIT_MASKED(STALL_DOP_GATING_DISABLE) },
 
 	/* Improve HiZ throughput on CHV. */
-	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
-
-	return 0;
-}
+	{ WA_CTX(""),
+	  ALL_REVS, REG(HIZ_CHICKEN),
+	  SET_BIT_MASKED(CHV_HZ_8X8_MODE_IN_1X) },
+};
 
-static int gen9_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static bool has_llc(struct drm_i915_private *dev_priv, struct i915_wa_reg *wa)
 {
-	if (HAS_LLC(dev_priv)) {
-		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
-		 *
-		 * Must match Display Engine. See
-		 * WaCompressedResourceDisplayNewHashMode.
-		 */
-		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
-				  GEN9_PBE_COMPRESSED_HASH_SELECTION);
-		WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
-				  GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
-	}
-
-	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
-	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
-	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
-			  FLOW_CONTROL_ENABLE |
-			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
+	return HAS_LLC(dev_priv);
+}
 
-	/* Syncing dependencies between camera and graphics:skl,bxt,kbl */
-	if (!IS_COFFEELAKE(dev_priv))
-		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
-				  GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC);
+static struct i915_wa_reg gen9_ctx_was[] = {
+	/*
+	 * Must match Display Engine. See
+	 * WaCompressedResourceDisplayNewHashMode.
+	 */
+	{ WA_CTX("WaCompressedResourceSamplerPbeMediaNewHashMode"),
+	  ALL_REVS, REG(COMMON_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN9_PBE_COMPRESSED_HASH_SELECTION),
+	  .pre_hook = has_llc },
+	{ WA_CTX("WaCompressedResourceSamplerPbeMediaNewHashMode"),
+	  ALL_REVS, REG(GEN9_HALF_SLICE_CHICKEN7),
+	  SET_BIT_MASKED(GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR),
+	  .pre_hook = has_llc },
+
+	{ WA_CTX("WaClearFlowControlGpgpuContextSave + WaDisablePartialInstShootdown"),
+	  ALL_REVS, REG(GEN8_ROW_CHICKEN),
+	  SET_BIT_MASKED(FLOW_CONTROL_ENABLE |
+			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE) },
 
-	/* WaDisableDgMirrorFixInHalfSliceChicken5:bxt */
-	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
-		WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
-				  GEN9_DG_MIRROR_FIX_ENABLE);
+	/*
+	 * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set
+	 * but we do that in per ctx batchbuffer as there is an issue
+	 * with this register not getting restored on ctx restore
+	 */
+	{ WA_CTX("WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken"),
+	  REVS(0, BXT_REVID_A1), REG(GEN7_COMMON_SLICE_CHICKEN1),
+	  SET_BIT_MASKED(GEN9_RHWO_OPTIMIZATION_DISABLE) },
 
-	/* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:bxt */
-	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
-		WA_SET_BIT_MASKED(GEN7_COMMON_SLICE_CHICKEN1,
-				  GEN9_RHWO_OPTIMIZATION_DISABLE);
-		/*
-		 * WA also requires GEN9_SLICE_COMMON_ECO_CHICKEN0[14:14] to be set
-		 * but we do that in per ctx batchbuffer as there is an issue
-		 * with this register not getting restored on ctx restore
-		 */
-	}
+	{ WA_CTX("WaEnableYV12BugFixInHalfSliceChicken7 + WaEnableSamplerGPGPUPreemptionSupport"),
+	  ALL_REVS, REG(GEN9_HALF_SLICE_CHICKEN7),
+	  SET_BIT_MASKED(GEN9_ENABLE_YV12_BUGFIX |
+			 GEN9_ENABLE_GPGPU_PREEMPTION) },
 
-	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
-	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
-	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
-			  GEN9_ENABLE_YV12_BUGFIX |
-			  GEN9_ENABLE_GPGPU_PREEMPTION);
+	{ WA_CTX("Wa4x4STCOptimizationDisable + WaDisablePartialResolveInVc"),
+	  ALL_REVS, REG(CACHE_MODE_1),
+	  SET_BIT_MASKED(GEN8_4x4_STC_OPTIMIZATION_DISABLE |
+	      GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE) },
 
-	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
-	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
-	WA_SET_BIT_MASKED(CACHE_MODE_1, (GEN8_4x4_STC_OPTIMIZATION_DISABLE |
-					 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE));
+	{ WA_CTX("WaCcsTlbPrefetchDisable"),
+	  ALL_REVS, REG(GEN9_HALF_SLICE_CHICKEN5),
+	  CLEAR_BIT_MASKED(GEN9_CCS_TLB_PREFETCH_ENABLE) },
 
-	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
-	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
-			  GEN9_CCS_TLB_PREFETCH_ENABLE);
+	{ WA_CTX("WaForceContextSaveRestoreNonCoherent"),
+	  ALL_REVS, REG(HDC_CHICKEN0),
+	  SET_BIT_MASKED(HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
+	      HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE) },
 
-	/* WaDisableMaskBasedCammingInRCC:bxt */
-	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
-		WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0,
-				  PIXEL_MASK_CAMMING_DISABLE);
-
-	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
-	WA_SET_BIT_MASKED(HDC_CHICKEN0,
-			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
-			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
-
-	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
-	 * both tied to WaForceContextSaveRestoreNonCoherent
-	 * in some hsds for skl. We keep the tie for all gen9. The
-	 * documentation is a bit hazy and so we want to get common behaviour,
-	 * even though there is no clear evidence we would need both on kbl/bxt.
-	 * This area has been source of system hangs so we play it safe
-	 * and mimic the skl regardless of what bspec says.
+	/*
+	 * WaForceEnableNonCoherent and WaDisableHDCInvalidation are
+	 * both tied to WaForceContextSaveRestoreNonCoherent in some hsds for
+	 * skl. We keep the tie for all gen9. The documentation is a bit hazy
+	 * and so we want to get common behaviour, even though there is no clear
+	 * evidence we would need both on kbl/bxt. This area has been source of
+	 * system hangs so we play it safe and mimic the skl regardless of what
+	 * bspec says.
 	 *
 	 * Use Force Non-Coherent whenever executing a 3D context. This
 	 * is a workaround for a possible hang in the unlikely event
 	 * a TLB invalidation occurs during a PSD flush.
 	 */
+	{ WA_CTX("WaForceEnableNonCoherent"),
+	  ALL_REVS, REG(HDC_CHICKEN0),
+	  SET_BIT_MASKED(HDC_FORCE_NON_COHERENT) },
 
-	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
-	WA_SET_BIT_MASKED(HDC_CHICKEN0,
-			  HDC_FORCE_NON_COHERENT);
-
-	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
-	if (IS_SKYLAKE(dev_priv) ||
-	    IS_KABYLAKE(dev_priv) ||
-	    IS_COFFEELAKE(dev_priv) ||
-	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
-		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
-				  GEN8_SAMPLER_POWER_BYPASS_DIS);
-
-	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
-	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
+	{ WA_CTX("WaDisableSTUnitPowerOptimization"),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN8_ST_PO_DISABLE) },
 
 	/*
 	 * Supporting preemption with fine-granularity requires changes in the
@@ -259,18 +238,17 @@  static int gen9_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	 * not real HW workarounds, but merely a way to start using preemption
 	 * while maintaining old contract with userspace.
 	 */
-
-	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
-	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
-
-	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
-	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK,
-			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
-
-	return 0;
-}
-
-static int skl_tune_iz_hashing(struct drm_i915_private *dev_priv)
+	{ WA_CTX("WaDisable3DMidCmdPreemption"),
+	  ALL_REVS, REG(GEN8_CS_CHICKEN1),
+	  CLEAR_BIT_MASKED(GEN9_PREEMPT_3D_OBJECT_LEVEL) },
+	{ WA_CTX("WaDisableGPGPUMidCmdPreemption"),
+	  ALL_REVS, REG(GEN8_CS_CHICKEN1),
+	  SET_FIELD_MASKED(GEN9_PREEMPT_GPGPU_LEVEL_MASK,
+			   GEN9_PREEMPT_GPGPU_COMMAND_LEVEL) },
+};
+
+static bool skl_tune_iz_hashing(struct drm_i915_private *dev_priv,
+				struct i915_wa_reg *wa)
 {
 	u8 vals[3] = { 0, 0, 0 };
 	unsigned int i;
@@ -296,211 +274,295 @@  static int skl_tune_iz_hashing(struct drm_i915_private *dev_priv)
 	}
 
 	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
-		return 0;
-
-	/* Tune IZ hashing. See intel_device_info_runtime_init() */
-	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
-			    GEN9_IZ_HASHING_MASK(2) |
-			    GEN9_IZ_HASHING_MASK(1) |
-			    GEN9_IZ_HASHING_MASK(0),
-			    GEN9_IZ_HASHING(2, vals[2]) |
-			    GEN9_IZ_HASHING(1, vals[1]) |
-			    GEN9_IZ_HASHING(0, vals[0]));
-
-	return 0;
-}
+		return false;
 
-static int skl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	wa->mask = GEN9_IZ_HASHING_MASK(2) |
+		   GEN9_IZ_HASHING_MASK(1) |
+		   GEN9_IZ_HASHING_MASK(0);
+	wa->value = _MASKED_FIELD(wa->mask, GEN9_IZ_HASHING(2, vals[2]) |
+					    GEN9_IZ_HASHING(1, vals[1]) |
+					    GEN9_IZ_HASHING(0, vals[0]));
 
-	return skl_tune_iz_hashing(dev_priv);
+	return true;
 }
 
-static int bxt_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	int ret;
+static struct i915_wa_reg skl_ctx_was[] = {
+	/* Syncing dependencies between camera and graphics */
+	{ WA_CTX(""),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC) },
 
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	{ WA_CTX("WaDisableSamplerPowerBypassForSOPingPong"),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN8_SAMPLER_POWER_BYPASS_DIS) },
 
-	/* WaDisableThreadStallDopClockGating:bxt */
-	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
-			  STALL_DOP_GATING_DISABLE);
+	/* Tune IZ hashing. See intel_device_info_runtime_init() */
+	{ WA_CTX(""),
+	  ALL_REVS, REG(GEN7_GT_MODE),
+	  .mask = 0, .value = 0,
+	  .pre_hook = skl_tune_iz_hashing },
+};
+
+static struct i915_wa_reg bxt_ctx_was[] = {
+	/* Syncing dependencies between camera and graphics */
+	{ WA_CTX(""),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC) },
+
+	{ WA_CTX("WaDisableDgMirrorFixInHalfSliceChicken5"),
+	  REVS(0, BXT_REVID_A1), REG(GEN9_HALF_SLICE_CHICKEN5),
+	  CLEAR_BIT_MASKED(GEN9_DG_MIRROR_FIX_ENABLE) },
+
+	{ WA_CTX("WaDisableMaskBasedCammingInRCC"),
+	  REVS(0, BXT_REVID_A1), REG(SLICE_ECO_CHICKEN0),
+	  SET_BIT_MASKED(PIXEL_MASK_CAMMING_DISABLE) },
+
+	{ WA_CTX("WaDisableSamplerPowerBypassForSOPingPong"),
+	  REVS(0, BXT_REVID_B0), REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN8_SAMPLER_POWER_BYPASS_DIS) },
+
+	{ WA_CTX("WaDisableThreadStallDopClockGating"),
+	  ALL_REVS, REG(GEN8_ROW_CHICKEN),
+	  SET_BIT_MASKED(STALL_DOP_GATING_DISABLE) },
+
+	{ WA_CTX("WaDisableSbeCacheDispatchPortSharing"),
+	  REVS(0, BXT_REVID_B0), REG(GEN7_HALF_SLICE_CHICKEN1),
+	  SET_BIT_MASKED(GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE) },
+
+	{ WA_CTX("WaToEnableHwFixForPushConstHWBug"),
+	  REVS(BXT_REVID_C0, REVID_FOREVER), REG(COMMON_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION) },
+};
+
+static struct i915_wa_reg kbl_ctx_was[] = {
+	/* Syncing dependencies between camera and graphics */
+	{ WA_CTX(""),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC) },
+
+	{ WA_CTX("WaDisableSamplerPowerBypassForSOPingPong"),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN8_SAMPLER_POWER_BYPASS_DIS) },
+
+	{ WA_CTX("WaDisableFenceDestinationToSLM (pre-prod)"),
+	  REVS(KBL_REVID_A0, KBL_REVID_A0), REG(HDC_CHICKEN0),
+	  SET_BIT_MASKED(HDC_FENCE_DEST_SLM_DISABLE) },
+
+	{ WA_CTX("WaToEnableHwFixForPushConstHWBug"),
+	  REVS(KBL_REVID_C0, REVID_FOREVER), REG(COMMON_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION) },
+
+	{ WA_CTX("WaDisableSbeCacheDispatchPortSharing"),
+	  ALL_REVS, REG(GEN7_HALF_SLICE_CHICKEN1),
+	  SET_BIT_MASKED(GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE) },
+};
+
+static struct i915_wa_reg glk_ctx_was[] = {
+	/* Syncing dependencies between camera and graphics */
+	{ WA_CTX(""),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC) },
+
+	{ WA_CTX("WaToEnableHwFixForPushConstHWBug"),
+	  ALL_REVS, REG(COMMON_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION) },
+};
+
+static struct i915_wa_reg cfl_ctx_was[] = {
+	{ WA_CTX("WaDisableSamplerPowerBypassForSOPingPong"),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(GEN8_SAMPLER_POWER_BYPASS_DIS) },
+
+	{ WA_CTX("WaToEnableHwFixForPushConstHWBug"),
+	  ALL_REVS, REG(COMMON_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION) },
+
+	{ WA_CTX("WaDisableSbeCacheDispatchPortSharing"),
+	  ALL_REVS, REG(GEN7_HALF_SLICE_CHICKEN1),
+	  SET_BIT_MASKED(GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE) },
+};
+
+static struct i915_wa_reg cnl_ctx_was[] = {
+	{ WA_CTX("WaForceContextSaveRestoreNonCoherent"),
+	  ALL_REVS, REG(CNL_HDC_CHICKEN0),
+	  SET_BIT_MASKED(HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT) },
+
+	{ WA_CTX("WaThrottleEUPerfToAvoidTDBackPressure (pre-prod)"),
+	  REVS(CNL_REVID_B0, CNL_REVID_B0), REG(GEN8_ROW_CHICKEN),
+	  SET_BIT_MASKED(THROTTLE_12_5) },
+
+	{ WA_CTX("WaDisableReplayBufferBankArbitrationOptimization"),
+	  ALL_REVS, REG(COMMON_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION) },
+
+	{ WA_CTX("WaDisableEnhancedSBEVertexCaching (pre-prod)"),
+	  REVS(0, CNL_REVID_B0), REG(COMMON_SLICE_CHICKEN2),
+	  SET_BIT_MASKED(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE) },
+
+	{ WA_CTX("WaPushConstantDereferenceHoldDisable"),
+	  ALL_REVS, REG(GEN7_ROW_CHICKEN2),
+	  SET_BIT_MASKED(PUSH_CONSTANT_DEREF_DISABLE) },
+
+	{ WA_CTX("FtrEnableFastAnisoL1BankingFix"),
+	  ALL_REVS, REG(HALF_SLICE_CHICKEN3),
+	  SET_BIT_MASKED(CNL_FAST_ANISO_L1_BANKING_FIX) },
+
+	{ WA_CTX("WaDisable3DMidCmdPreemption"),
+	  ALL_REVS, REG(GEN8_CS_CHICKEN1),
+	  CLEAR_BIT_MASKED(GEN9_PREEMPT_3D_OBJECT_LEVEL) },
+
+	{ WA_CTX("WaDisableGPGPUMidCmdPreemption"),
+	  ALL_REVS, REG(GEN8_CS_CHICKEN1),
+	  SET_FIELD_MASKED(GEN9_PREEMPT_GPGPU_LEVEL_MASK,
+			   GEN9_PREEMPT_GPGPU_COMMAND_LEVEL) },
+};
+
+static const struct i915_wa_reg_table bdw_ctx_wa_tbl[] = {
+	{ gen8_ctx_was, ARRAY_SIZE(gen8_ctx_was) },
+	{ bdw_ctx_was,  ARRAY_SIZE(bdw_ctx_was) },
+};
+
+static const struct i915_wa_reg_table chv_ctx_wa_tbl[] = {
+	{ gen8_ctx_was, ARRAY_SIZE(gen8_ctx_was) },
+	{ chv_ctx_was,  ARRAY_SIZE(chv_ctx_was) },
+};
+
+static const struct i915_wa_reg_table skl_ctx_wa_tbl[] = {
+	{ gen9_ctx_was, ARRAY_SIZE(gen9_ctx_was) },
+	{ skl_ctx_was,  ARRAY_SIZE(skl_ctx_was) },
+};
+
+static const struct i915_wa_reg_table bxt_ctx_wa_tbl[] = {
+	{ gen9_ctx_was, ARRAY_SIZE(gen9_ctx_was) },
+	{ bxt_ctx_was,  ARRAY_SIZE(bxt_ctx_was) },
+};
+
+static const struct i915_wa_reg_table kbl_ctx_wa_tbl[] = {
+	{ gen9_ctx_was, ARRAY_SIZE(gen9_ctx_was) },
+	{ kbl_ctx_was,  ARRAY_SIZE(kbl_ctx_was) },
+};
+
+static const struct i915_wa_reg_table glk_ctx_wa_tbl[] = {
+	{ gen9_ctx_was, ARRAY_SIZE(gen9_ctx_was) },
+	{ glk_ctx_was,  ARRAY_SIZE(glk_ctx_was) },
+};
+
+static const struct i915_wa_reg_table cfl_ctx_wa_tbl[] = {
+	{ gen9_ctx_was, ARRAY_SIZE(gen9_ctx_was) },
+	{ cfl_ctx_was,  ARRAY_SIZE(cfl_ctx_was) },
+};
+
+static const struct i915_wa_reg_table cnl_ctx_wa_tbl[] = {
+	{ cnl_ctx_was,  ARRAY_SIZE(cnl_ctx_was) },
+};
+
+void intel_ctx_workarounds_get(struct drm_i915_private *dev_priv,
+			       const struct i915_wa_reg_table **wa_table,
+			       uint *table_count)
+{
+	*wa_table = NULL;
+	*table_count = 0;
 
-	/* WaDisableSbeCacheDispatchPortSharing:bxt */
-	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0)) {
-		WA_SET_BIT_MASKED(
-			GEN7_HALF_SLICE_CHICKEN1,
-			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
+	if (INTEL_GEN(dev_priv) < 8) {
+		return;
+	} else if (IS_BROADWELL(dev_priv)) {
+		*wa_table = bdw_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(bdw_ctx_wa_tbl);
+	} else if (IS_CHERRYVIEW(dev_priv)) {
+		*wa_table = chv_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(chv_ctx_wa_tbl);
+	} else if (IS_SKYLAKE(dev_priv)) {
+		*wa_table = skl_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(skl_ctx_wa_tbl);
+	} else if (IS_BROXTON(dev_priv)) {
+		*wa_table = bxt_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(bxt_ctx_wa_tbl);
+	} else if (IS_KABYLAKE(dev_priv)) {
+		*wa_table = kbl_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(kbl_ctx_wa_tbl);
+	} else if (IS_GEMINILAKE(dev_priv)) {
+		*wa_table = glk_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(glk_ctx_wa_tbl);
+	} else if (IS_COFFEELAKE(dev_priv)) {
+		*wa_table = cfl_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(cfl_ctx_wa_tbl);
+	} else if (IS_CANNONLAKE(dev_priv)) {
+		*wa_table = cnl_ctx_wa_tbl;
+		*table_count = ARRAY_SIZE(cnl_ctx_wa_tbl);
+	} else {
+		MISSING_CASE(INTEL_GEN(dev_priv));
+		return;
 	}
-
-	/* WaToEnableHwFixForPushConstHWBug:bxt */
-	if (IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
-		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
-				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
-
-	return 0;
-}
-
-static int kbl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
-
-	/* WaDisableFenceDestinationToSLM:kbl (pre-prod) */
-	if (IS_KBL_REVID(dev_priv, KBL_REVID_A0, KBL_REVID_A0))
-		WA_SET_BIT_MASKED(HDC_CHICKEN0,
-				  HDC_FENCE_DEST_SLM_DISABLE);
-
-	/* WaToEnableHwFixForPushConstHWBug:kbl */
-	if (IS_KBL_REVID(dev_priv, KBL_REVID_C0, REVID_FOREVER))
-		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
-				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
-
-	/* WaDisableSbeCacheDispatchPortSharing:kbl */
-	WA_SET_BIT_MASKED(
-		GEN7_HALF_SLICE_CHICKEN1,
-		GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
-
-	return 0;
-}
-
-static int glk_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
-
-	/* WaToEnableHwFixForPushConstHWBug:glk */
-	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
-			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
-
-	return 0;
-}
-
-static int cfl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
-
-	/* WaToEnableHwFixForPushConstHWBug:cfl */
-	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
-			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
-
-	/* WaDisableSbeCacheDispatchPortSharing:cfl */
-	WA_SET_BIT_MASKED(
-		GEN7_HALF_SLICE_CHICKEN1,
-		GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
-
-	return 0;
 }
 
-static int cnl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static uint ctx_workarounds_init(struct drm_i915_private *dev_priv,
+				 const struct i915_wa_reg_table *wa_table,
+				 uint table_count)
 {
-	/* WaForceContextSaveRestoreNonCoherent:cnl */
-	WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0,
-			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT);
-
-	/* WaThrottleEUPerfToAvoidTDBackPressure:cnl(pre-prod) */
-	if (IS_CNL_REVID(dev_priv, CNL_REVID_B0, CNL_REVID_B0))
-		WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, THROTTLE_12_5);
-
-	/* WaDisableReplayBufferBankArbitrationOptimization:cnl */
-	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
-			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
+	uint total_count = 0;
+	int i, j;
 
-	/* WaDisableEnhancedSBEVertexCaching:cnl (pre-prod) */
-	if (IS_CNL_REVID(dev_priv, 0, CNL_REVID_B0))
-		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
-				  GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE);
+	for (i = 0; i < table_count; i++) {
+		struct i915_wa_reg *wa = wa_table[i].table;
 
-	/* WaPushConstantDereferenceHoldDisable:cnl */
-	WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE);
+		for (j = 0; j < wa_table[i].count; j++) {
+			wa[j].applied =
+				IS_REVID(dev_priv, wa[j].since, wa[j].until);
 
-	/* FtrEnableFastAnisoL1BankingFix:cnl */
-	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX);
+			if (wa[j].applied && wa[j].pre_hook)
+				wa[j].applied = wa[j].pre_hook(dev_priv, &wa[j]);
 
-	/* WaDisable3DMidCmdPreemption:cnl */
-	WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
+			/* Post-hooks do not make sense in context WAs */
+			GEM_BUG_ON(wa[j].post_hook);
 
-	/* WaDisableGPGPUMidCmdPreemption:cnl */
-	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK,
-			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
+			/* We expect all context registers to be masked */
+			GEM_BUG_ON(!wa[j].is_masked_reg);
+			GEM_BUG_ON(wa[j].mask & 0xffff0000);
 
-	return 0;
-}
-
-int intel_ctx_workarounds_init(struct drm_i915_private *dev_priv)
-{
-	int err;
-
-	dev_priv->workarounds.count = 0;
-
-	if (INTEL_GEN(dev_priv) < 8)
-		err = 0;
-	else if (IS_BROADWELL(dev_priv))
-		err = bdw_ctx_workarounds_init(dev_priv);
-	else if (IS_CHERRYVIEW(dev_priv))
-		err = chv_ctx_workarounds_init(dev_priv);
-	else if (IS_SKYLAKE(dev_priv))
-		err = skl_ctx_workarounds_init(dev_priv);
-	else if (IS_BROXTON(dev_priv))
-		err = bxt_ctx_workarounds_init(dev_priv);
-	else if (IS_KABYLAKE(dev_priv))
-		err = kbl_ctx_workarounds_init(dev_priv);
-	else if (IS_GEMINILAKE(dev_priv))
-		err = glk_ctx_workarounds_init(dev_priv);
-	else if (IS_COFFEELAKE(dev_priv))
-		err = cfl_ctx_workarounds_init(dev_priv);
-	else if (IS_CANNONLAKE(dev_priv))
-		err = cnl_ctx_workarounds_init(dev_priv);
-	else {
-		MISSING_CASE(INTEL_GEN(dev_priv));
-		err = 0;
+			if (wa[j].applied)
+				total_count++;
+		}
 	}
-	if (err)
-		return err;
 
-	DRM_DEBUG_DRIVER("Number of context specific w/a: %d\n",
-			 dev_priv->workarounds.count);
-	return 0;
+	dev_priv->workarounds.ctx_count = total_count;
+	DRM_DEBUG_DRIVER("Number of context specific w/a: %u\n", total_count);
+
+	return total_count;
 }
 
 int intel_ctx_workarounds_emit(struct drm_i915_gem_request *req)
 {
-	struct i915_workarounds *w = &req->i915->workarounds;
+	struct drm_i915_private *dev_priv = req->i915;
+	const struct i915_wa_reg_table *wa_table;
+	uint table_count, total_count;
 	u32 *cs;
-	int ret, i;
+	int i, j, ret;
+
+	intel_ctx_workarounds_get(dev_priv, &wa_table, &table_count);
 
-	if (w->count == 0)
+	total_count = ctx_workarounds_init(dev_priv, wa_table, table_count);
+	if (total_count == 0)
 		return 0;
 
 	ret = req->engine->emit_flush(req, EMIT_BARRIER);
 	if (ret)
 		return ret;
 
-	cs = intel_ring_begin(req, (w->count * 2 + 2));
+	cs = intel_ring_begin(req, (total_count * 2 + 2));
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
-	*cs++ = MI_LOAD_REGISTER_IMM(w->count);
-	for (i = 0; i < w->count; i++) {
-		*cs++ = i915_mmio_reg_offset(w->reg[i].addr);
-		*cs++ = w->reg[i].value;
+	*cs++ = MI_LOAD_REGISTER_IMM(total_count);
+	for (i = 0; i < table_count; i++) {
+		const struct i915_wa_reg *wa = wa_table[i].table;
+
+		for (j = 0; j < wa_table[i].count; j++) {
+			if (!wa[j].applied)
+				continue;
+
+			*cs++ = i915_mmio_reg_offset(wa[j].addr);
+			*cs++ = wa[j].value;
+		}
 	}
 	*cs++ = MI_NOOP;
 
diff --git a/drivers/gpu/drm/i915/intel_workarounds.h b/drivers/gpu/drm/i915/intel_workarounds.h
index bba51bb..38763e7 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.h
+++ b/drivers/gpu/drm/i915/intel_workarounds.h
@@ -25,7 +25,9 @@ 
 #ifndef _I915_WORKAROUNDS_H_
 #define _I915_WORKAROUNDS_H_
 
-int intel_ctx_workarounds_init(struct drm_i915_private *dev_priv);
+void intel_ctx_workarounds_get(struct drm_i915_private *dev_priv,
+			       const struct i915_wa_reg_table **wa_table,
+			       uint *table_count);
 int intel_ctx_workarounds_emit(struct drm_i915_gem_request *req);
 
 void intel_gt_workarounds_apply(struct drm_i915_private *dev_priv);