diff mbox series

drm/i915/selftests: Exercise CS TLB invalidation

Message ID 20190915163749.19540-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series drm/i915/selftests: Exercise CS TLB invalidation | expand

Commit Message

Chris Wilson Sept. 15, 2019, 4:37 p.m. UTC
Check that we are correctly invalidating the TLB at the start of a
batch after updating the GTT.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 227 ++++++++++++++++++
 1 file changed, 227 insertions(+)

Comments

Mika Kuoppala Sept. 19, 2019, 12:57 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Check that we are correctly invalidating the TLB at the start of a
> batch after updating the GTT.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> ---
>  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 227 ++++++++++++++++++
>  1 file changed, 227 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index 598c18d10640..f8709b332bd7 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -25,13 +25,16 @@
>  #include <linux/list_sort.h>
>  #include <linux/prime_numbers.h>
>  
> +#include "gem/i915_gem_context.h"
>  #include "gem/selftests/mock_context.h"
> +#include "gt/intel_context.h"
>  
>  #include "i915_random.h"
>  #include "i915_selftest.h"
>  
>  #include "mock_drm.h"
>  #include "mock_gem_device.h"
> +#include "igt_flush_test.h"
>  
>  static void cleanup_freed_objects(struct drm_i915_private *i915)
>  {
> @@ -1705,6 +1708,229 @@ int i915_gem_gtt_mock_selftests(void)
>  	return err;
>  }
>  
> +static int context_sync(struct intel_context *ce)
> +{
> +	struct i915_request *rq;
> +	long timeout;
> +
> +	rq = intel_context_create_request(ce);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	i915_request_get(rq);
> +	i915_request_add(rq);
> +
> +	timeout = i915_request_wait(rq, 0, HZ / 5);
> +	i915_request_put(rq);
> +
> +	return timeout < 0 ? -EIO : 0;
> +}
> +
> +static int submit_batch(struct intel_context *ce, u64 addr)
> +{
> +	struct i915_request *rq;
> +	int err;
> +
> +	rq = intel_context_create_request(ce);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	err = 0;
> +	if (rq->engine->emit_init_breadcrumb) /* detect a hang */

for seqno write?

> +		err = rq->engine->emit_init_breadcrumb(rq);
> +	if (err == 0)
> +		err = rq->engine->emit_bb_start(rq, addr, 0, 0);
> +

In context_sync part we grabbed a reference. In here we
don't. I don't see how we can get away without it even
if we don't wait in here.

> +	i915_request_add(rq);
> +
> +	return err;
> +}
> +
> +static int igt_cs_tlb(void *arg)
> +{
> +	const unsigned int count = PAGE_SIZE / 64;
> +	const unsigned int chunk_size = count * PAGE_SIZE;
> +	struct drm_i915_private *i915 = arg;
> +	struct drm_i915_gem_object *bbe, *out;
> +	struct i915_gem_engines_iter it;
> +	struct i915_address_space *vm;
> +	struct i915_gem_context *ctx;
> +	struct intel_context *ce;
> +	struct drm_file *file;
> +	struct i915_vma *vma;
> +	unsigned int i;
> +	u32 *result;
> +	u32 *batch;
> +	int err = 0;
> +
> +	file = mock_file(i915);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	ctx = live_context(i915, file);
> +	if (IS_ERR(ctx)) {
> +		err = PTR_ERR(ctx);
> +		goto out_unlock;
> +	}
> +
> +	vm = ctx->vm;
> +	if (!vm)
> +		goto out_unlock;
> +
> +	bbe = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(bbe)) {
> +		err = PTR_ERR(bbe);
> +		goto out_unlock;
> +	}
> +
> +	batch = i915_gem_object_pin_map(bbe, I915_MAP_WC);
> +	if (IS_ERR(batch)) {
> +		err = PTR_ERR(batch);
> +		goto out_bbe;
> +	}
> +	for (i = 0; i < count; i++) {
> +		u32 *cs = batch + i * 64 / sizeof(*cs);
> +		u64 addr = (vm->total - PAGE_SIZE) + i * sizeof(u32);
> +
> +		GEM_BUG_ON(INTEL_GEN(i915) < 6);
> +		cs[0] = MI_STORE_DWORD_IMM_GEN4;
> +		if (INTEL_GEN(i915) >= 8) {
> +			cs[1] = lower_32_bits(addr);
> +			cs[2] = upper_32_bits(addr);
> +			cs[3] = i;
> +			cs[4] = MI_NOOP;
> +			cs[5] = MI_BATCH_BUFFER_START_GEN8;
> +		} else {
> +			cs[1] = 0;
> +			cs[2] = lower_32_bits(addr);
> +			cs[3] = i;
> +			cs[4] = MI_NOOP;
> +			cs[5] = MI_BATCH_BUFFER_START;
> +		}
> +	}
> +
> +	out = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(out)) {
> +		err = PTR_ERR(out);
> +		goto out_batch;
> +	}
> +	i915_gem_object_set_cache_coherency(out, I915_CACHING_CACHED);
> +
> +	vma = i915_vma_instance(out, vm, NULL);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto out_batch;
> +	}
> +
> +	err = i915_vma_pin(vma, 0, 0,
> +			   PIN_USER |
> +			   PIN_OFFSET_FIXED |
> +			   (vm->total - PAGE_SIZE));
> +	if (err)
> +		goto out_out;

out_put?

Oh and we don't have to do anything with the instance on
error paths?

> +	GEM_BUG_ON(vma->node.start != vm->total - PAGE_SIZE);
> +
> +	result = i915_gem_object_pin_map(out, I915_MAP_WB);
> +	if (IS_ERR(result)) {
> +		err = PTR_ERR(result);
> +		goto out_out;
> +	}
> +
> +	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
> +		IGT_TIMEOUT(end_time);
> +		unsigned long pass = 0;
> +
> +		if (!intel_engine_can_store_dword(ce->engine))
> +			continue;
> +
> +		while (!__igt_timeout(end_time, NULL)) {
> +			u64 offset;
> +
> +			offset = random_offset(0, vm->total - PAGE_SIZE,
> +					       chunk_size, PAGE_SIZE);
> +
> +			err = vm->allocate_va_range(vm, offset, chunk_size);
> +			if (err)
> +				goto end;
> +
> +			memset32(result, STACK_MAGIC, PAGE_SIZE / sizeof(u32));
> +
> +			vma = i915_vma_instance(bbe, vm, NULL);
> +			if (IS_ERR(vma)) {
> +				err = PTR_ERR(vma);
> +				goto end;
> +			}
> +
> +			err = vma->ops->set_pages(vma);
> +			if (err)
> +				goto end;
> +
> +			/* Replace the TLB with target batches */
> +			for (i = 0; i < count; i++) {
> +				u32 *cs = batch + i * 64 / sizeof(*cs);
> +				u64 addr;
> +
> +				vma->node.start = offset + i * PAGE_SIZE;

on previous loop, we have now primed the pte to tlb in here?

> +				vm->insert_entries(vm, vma, I915_CACHE_NONE, 0);

..now changing in it here...

> +
> +				addr = vma->node.start + i * 64;
> +				cs[4] = MI_NOOP;
> +				cs[6] = lower_32_bits(addr);
> +				cs[7] = upper_32_bits(addr);
> +				wmb();
> +
> +				err = submit_batch(ce, addr);

in hope that with this submission hardware will see the old one and
completely miss the store+bb start?

Perhaps rewiring a more dummy emit only to prove this case
is pushing it.

But I am curious to know if you did try it out by removing
the invalidate on the emits and managed to bring
out the missing writes?

-Mika

> +				if (err)
> +					goto end;
> +			}
> +
> +			yield();
> +			for (i = 0; i < count; i++)
> +				batch[i * 64 / sizeof(*batch) + 4] =
> +					MI_BATCH_BUFFER_END;
> +			wmb();
> +
> +			vma->ops->clear_pages(vma);
> +
> +			err = context_sync(ce);
> +			if (err) {
> +				pr_err("%s: writes timed out\n",
> +				       ce->engine->name);
> +				goto end;
> +			}
> +
> +			for (i = 0; i < count; i++) {
> +				if (result[i] != i) {
> +					pr_err("%s: Write lost on pass %lu, at offset %llx, index %d, found %x, expected %x\n",
> +					       ce->engine->name, pass,
> +					       offset, i, result[i], i);
> +					err = -EINVAL;
> +					goto end;
> +				}
> +			}
> +
> +			vm->clear_range(vm, offset, chunk_size);
> +			pass++;
> +		}
> +	}
> +end:
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +	i915_gem_context_unlock_engines(ctx);
> +	i915_gem_object_unpin_map(out);
> +out_out:
> +	i915_gem_object_put(out);
> +out_batch:
> +	i915_gem_object_unpin_map(bbe);
> +out_bbe:
> +	i915_gem_object_put(bbe);
> +out_unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	mock_file_free(i915, file);
> +	return err;
> +}
> +
>  int i915_gem_gtt_live_selftests(struct drm_i915_private *i915)
>  {
>  	static const struct i915_subtest tests[] = {
> @@ -1722,6 +1948,7 @@ int i915_gem_gtt_live_selftests(struct drm_i915_private *i915)
>  		SUBTEST(igt_ggtt_pot),
>  		SUBTEST(igt_ggtt_fill),
>  		SUBTEST(igt_ggtt_page),
> +		SUBTEST(igt_cs_tlb),
>  	};
>  
>  	GEM_BUG_ON(offset_in_page(i915->ggtt.vm.total));
> -- 
> 2.23.0
Chris Wilson Sept. 19, 2019, 1:08 p.m. UTC | #2
Quoting Mika Kuoppala (2019-09-19 13:57:45)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Check that we are correctly invalidating the TLB at the start of a
> > batch after updating the GTT.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> > ---
> >  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 227 ++++++++++++++++++
> >  1 file changed, 227 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> > index 598c18d10640..f8709b332bd7 100644
> > --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> > @@ -25,13 +25,16 @@
> >  #include <linux/list_sort.h>
> >  #include <linux/prime_numbers.h>
> >  
> > +#include "gem/i915_gem_context.h"
> >  #include "gem/selftests/mock_context.h"
> > +#include "gt/intel_context.h"
> >  
> >  #include "i915_random.h"
> >  #include "i915_selftest.h"
> >  
> >  #include "mock_drm.h"
> >  #include "mock_gem_device.h"
> > +#include "igt_flush_test.h"
> >  
> >  static void cleanup_freed_objects(struct drm_i915_private *i915)
> >  {
> > @@ -1705,6 +1708,229 @@ int i915_gem_gtt_mock_selftests(void)
> >       return err;
> >  }
> >  
> > +static int context_sync(struct intel_context *ce)
> > +{
> > +     struct i915_request *rq;
> > +     long timeout;
> > +
> > +     rq = intel_context_create_request(ce);
> > +     if (IS_ERR(rq))
> > +             return PTR_ERR(rq);
> > +
> > +     i915_request_get(rq);
> > +     i915_request_add(rq);
> > +
> > +     timeout = i915_request_wait(rq, 0, HZ / 5);
> > +     i915_request_put(rq);
> > +
> > +     return timeout < 0 ? -EIO : 0;
> > +}
> > +
> > +static int submit_batch(struct intel_context *ce, u64 addr)
> > +{
> > +     struct i915_request *rq;
> > +     int err;
> > +
> > +     rq = intel_context_create_request(ce);
> > +     if (IS_ERR(rq))
> > +             return PTR_ERR(rq);
> > +
> > +     err = 0;
> > +     if (rq->engine->emit_init_breadcrumb) /* detect a hang */
> 
> for seqno write?

If we expect an initial breadcrumb, we use it during reset to identify
if we are inside a payload (as opposed to being inside the semaphore
wait). So we need to emit the breadcrumb if we may hang in our batch.

> > +             err = rq->engine->emit_init_breadcrumb(rq);
> > +     if (err == 0)
> > +             err = rq->engine->emit_bb_start(rq, addr, 0, 0);
> > +
> 
> In context_sync part we grabbed a reference. In here we
> don't. I don't see how we can get away without it even
> if we don't wait in here.

Hmm, I suppose you can argue that we do now have a later deref in the
spinner. That wasn't there before...

> > +     vma = i915_vma_instance(out, vm, NULL);
> > +     if (IS_ERR(vma)) {
> > +             err = PTR_ERR(vma);
> > +             goto out_batch;
> > +     }
> > +
> > +     err = i915_vma_pin(vma, 0, 0,
> > +                        PIN_USER |
> > +                        PIN_OFFSET_FIXED |
> > +                        (vm->total - PAGE_SIZE));
> > +     if (err)
> > +             goto out_out;
> 
> out_put?

Joonas likes out: for normal exit paths that double for error handling.

> Oh and we don't have to do anything with the instance on
> error paths?

No. The vma does not yet have an independent lifetime from the object
(they are all owned by objects currently).

> > +                     /* Replace the TLB with target batches */
> > +                     for (i = 0; i < count; i++) {
> > +                             u32 *cs = batch + i * 64 / sizeof(*cs);
> > +                             u64 addr;
> > +
> > +                             vma->node.start = offset + i * PAGE_SIZE;
> 
> on previous loop, we have now primed the pte to tlb in here?

We're using the same PTE as before, in the expectation that the TLB
still contains that lookup.

> > +                             vm->insert_entries(vm, vma, I915_CACHE_NONE, 0);
> 
> ..now changing in it here...
> 
> > +
> > +                             addr = vma->node.start + i * 64;
> > +                             cs[4] = MI_NOOP;
> > +                             cs[6] = lower_32_bits(addr);
> > +                             cs[7] = upper_32_bits(addr);
> > +                             wmb();
> > +
> > +                             err = submit_batch(ce, addr);
> 
> in hope that with this submission hardware will see the old one and
> completely miss the store+bb start?

The hope is we see the right page of course! The test is to detect when
it sees the old page instead.
 
> Perhaps rewiring a more dummy emit only to prove this case
> is pushing it.
> 
> But I am curious to know if you did try it out by removing
> the invalidate on the emits and managed to bring
> out the missing writes?

We can demonstrate it on Tigerlake :)

Indeed it detects the remove of MI_INVALIDATE_TLB elsewhere.
-Chris
Mika Kuoppala Sept. 19, 2019, 1:39 p.m. UTC | #3
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2019-09-19 13:57:45)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Check that we are correctly invalidating the TLB at the start of a
>> > batch after updating the GTT.
>> >
>> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 227 ++++++++++++++++++
>> >  1 file changed, 227 insertions(+)
>> >
>> > diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
>> > index 598c18d10640..f8709b332bd7 100644
>> > --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
>> > +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
>> > @@ -25,13 +25,16 @@
>> >  #include <linux/list_sort.h>
>> >  #include <linux/prime_numbers.h>
>> >  
>> > +#include "gem/i915_gem_context.h"
>> >  #include "gem/selftests/mock_context.h"
>> > +#include "gt/intel_context.h"
>> >  
>> >  #include "i915_random.h"
>> >  #include "i915_selftest.h"
>> >  
>> >  #include "mock_drm.h"
>> >  #include "mock_gem_device.h"
>> > +#include "igt_flush_test.h"
>> >  
>> >  static void cleanup_freed_objects(struct drm_i915_private *i915)
>> >  {
>> > @@ -1705,6 +1708,229 @@ int i915_gem_gtt_mock_selftests(void)
>> >       return err;
>> >  }
>> >  
>> > +static int context_sync(struct intel_context *ce)
>> > +{
>> > +     struct i915_request *rq;
>> > +     long timeout;
>> > +
>> > +     rq = intel_context_create_request(ce);
>> > +     if (IS_ERR(rq))
>> > +             return PTR_ERR(rq);
>> > +
>> > +     i915_request_get(rq);
>> > +     i915_request_add(rq);
>> > +
>> > +     timeout = i915_request_wait(rq, 0, HZ / 5);
>> > +     i915_request_put(rq);
>> > +
>> > +     return timeout < 0 ? -EIO : 0;
>> > +}
>> > +
>> > +static int submit_batch(struct intel_context *ce, u64 addr)
>> > +{
>> > +     struct i915_request *rq;
>> > +     int err;
>> > +
>> > +     rq = intel_context_create_request(ce);
>> > +     if (IS_ERR(rq))
>> > +             return PTR_ERR(rq);
>> > +
>> > +     err = 0;
>> > +     if (rq->engine->emit_init_breadcrumb) /* detect a hang */
>> 
>> for seqno write?
>
> If we expect an initial breadcrumb, we use it during reset to identify
> if we are inside a payload (as opposed to being inside the semaphore
> wait). So we need to emit the breadcrumb if we may hang in our batch.
>
>> > +             err = rq->engine->emit_init_breadcrumb(rq);
>> > +     if (err == 0)
>> > +             err = rq->engine->emit_bb_start(rq, addr, 0, 0);
>> > +
>> 
>> In context_sync part we grabbed a reference. In here we
>> don't. I don't see how we can get away without it even
>> if we don't wait in here.
>
> Hmm, I suppose you can argue that we do now have a later deref in the
> spinner. That wasn't there before...
>
>> > +     vma = i915_vma_instance(out, vm, NULL);
>> > +     if (IS_ERR(vma)) {
>> > +             err = PTR_ERR(vma);
>> > +             goto out_batch;
>> > +     }
>> > +
>> > +     err = i915_vma_pin(vma, 0, 0,
>> > +                        PIN_USER |
>> > +                        PIN_OFFSET_FIXED |
>> > +                        (vm->total - PAGE_SIZE));
>> > +     if (err)
>> > +             goto out_out;
>> 
>> out_put?
>
> Joonas likes out: for normal exit paths that double for error handling.

Ok. Fine with me. I just like the last part to describe what the first part
of onion out does. Don't have to so much scroll back and forth
in editor.

>
>> Oh and we don't have to do anything with the instance on
>> error paths?
>
> No. The vma does not yet have an independent lifetime from the object
> (they are all owned by objects currently).
>
>> > +                     /* Replace the TLB with target batches */
>> > +                     for (i = 0; i < count; i++) {
>> > +                             u32 *cs = batch + i * 64 / sizeof(*cs);
>> > +                             u64 addr;
>> > +
>> > +                             vma->node.start = offset + i * PAGE_SIZE;
>> 
>> on previous loop, we have now primed the pte to tlb in here?
>
> We're using the same PTE as before, in the expectation that the TLB
> still contains that lookup.
>
>> > +                             vm->insert_entries(vm, vma, I915_CACHE_NONE, 0);
>> 
>> ..now changing in it here...
>> 
>> > +
>> > +                             addr = vma->node.start + i * 64;
>> > +                             cs[4] = MI_NOOP;
>> > +                             cs[6] = lower_32_bits(addr);
>> > +                             cs[7] = upper_32_bits(addr);
>> > +                             wmb();
>> > +
>> > +                             err = submit_batch(ce, addr);
>> 
>> in hope that with this submission hardware will see the old one and
>> completely miss the store+bb start?
>
> The hope is we see the right page of course! The test is to detect when
> it sees the old page instead.

Right, I guess it just depends who is hoping wrt to outcome =)

>  
>> Perhaps rewiring a more dummy emit only to prove this case
>> is pushing it.
>> 
>> But I am curious to know if you did try it out by removing
>> the invalidate on the emits and managed to bring
>> out the missing writes?
>
> We can demonstrate it on Tigerlake :)
>
> Indeed it detects the remove of MI_INVALIDATE_TLB elsewhere.

Neat addition to our triaging kit this will be.
-Mika
Chris Wilson Sept. 19, 2019, 1:46 p.m. UTC | #4
Quoting Mika Kuoppala (2019-09-19 14:39:59)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2019-09-19 13:57:45)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> > +     vma = i915_vma_instance(out, vm, NULL);
> >> > +     if (IS_ERR(vma)) {
> >> > +             err = PTR_ERR(vma);
> >> > +             goto out_batch;
> >> > +     }
> >> > +
> >> > +     err = i915_vma_pin(vma, 0, 0,
> >> > +                        PIN_USER |
> >> > +                        PIN_OFFSET_FIXED |
> >> > +                        (vm->total - PAGE_SIZE));
> >> > +     if (err)
> >> > +             goto out_out;
> >> 
> >> out_put?
> >
> > Joonas likes out: for normal exit paths that double for error handling.
> 
> Ok. Fine with me. I just like the last part to describe what the first part
> of onion out does. Don't have to so much scroll back and forth
> in editor.

Oh, I didn't notice I had a typo. I thought it was out_put!
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index 598c18d10640..f8709b332bd7 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -25,13 +25,16 @@ 
 #include <linux/list_sort.h>
 #include <linux/prime_numbers.h>
 
+#include "gem/i915_gem_context.h"
 #include "gem/selftests/mock_context.h"
+#include "gt/intel_context.h"
 
 #include "i915_random.h"
 #include "i915_selftest.h"
 
 #include "mock_drm.h"
 #include "mock_gem_device.h"
+#include "igt_flush_test.h"
 
 static void cleanup_freed_objects(struct drm_i915_private *i915)
 {
@@ -1705,6 +1708,229 @@  int i915_gem_gtt_mock_selftests(void)
 	return err;
 }
 
+static int context_sync(struct intel_context *ce)
+{
+	struct i915_request *rq;
+	long timeout;
+
+	rq = intel_context_create_request(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	i915_request_get(rq);
+	i915_request_add(rq);
+
+	timeout = i915_request_wait(rq, 0, HZ / 5);
+	i915_request_put(rq);
+
+	return timeout < 0 ? -EIO : 0;
+}
+
+static int submit_batch(struct intel_context *ce, u64 addr)
+{
+	struct i915_request *rq;
+	int err;
+
+	rq = intel_context_create_request(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	err = 0;
+	if (rq->engine->emit_init_breadcrumb) /* detect a hang */
+		err = rq->engine->emit_init_breadcrumb(rq);
+	if (err == 0)
+		err = rq->engine->emit_bb_start(rq, addr, 0, 0);
+
+	i915_request_add(rq);
+
+	return err;
+}
+
+static int igt_cs_tlb(void *arg)
+{
+	const unsigned int count = PAGE_SIZE / 64;
+	const unsigned int chunk_size = count * PAGE_SIZE;
+	struct drm_i915_private *i915 = arg;
+	struct drm_i915_gem_object *bbe, *out;
+	struct i915_gem_engines_iter it;
+	struct i915_address_space *vm;
+	struct i915_gem_context *ctx;
+	struct intel_context *ce;
+	struct drm_file *file;
+	struct i915_vma *vma;
+	unsigned int i;
+	u32 *result;
+	u32 *batch;
+	int err = 0;
+
+	file = mock_file(i915);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	mutex_lock(&i915->drm.struct_mutex);
+	ctx = live_context(i915, file);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+		goto out_unlock;
+	}
+
+	vm = ctx->vm;
+	if (!vm)
+		goto out_unlock;
+
+	bbe = i915_gem_object_create_internal(i915, PAGE_SIZE);
+	if (IS_ERR(bbe)) {
+		err = PTR_ERR(bbe);
+		goto out_unlock;
+	}
+
+	batch = i915_gem_object_pin_map(bbe, I915_MAP_WC);
+	if (IS_ERR(batch)) {
+		err = PTR_ERR(batch);
+		goto out_bbe;
+	}
+	for (i = 0; i < count; i++) {
+		u32 *cs = batch + i * 64 / sizeof(*cs);
+		u64 addr = (vm->total - PAGE_SIZE) + i * sizeof(u32);
+
+		GEM_BUG_ON(INTEL_GEN(i915) < 6);
+		cs[0] = MI_STORE_DWORD_IMM_GEN4;
+		if (INTEL_GEN(i915) >= 8) {
+			cs[1] = lower_32_bits(addr);
+			cs[2] = upper_32_bits(addr);
+			cs[3] = i;
+			cs[4] = MI_NOOP;
+			cs[5] = MI_BATCH_BUFFER_START_GEN8;
+		} else {
+			cs[1] = 0;
+			cs[2] = lower_32_bits(addr);
+			cs[3] = i;
+			cs[4] = MI_NOOP;
+			cs[5] = MI_BATCH_BUFFER_START;
+		}
+	}
+
+	out = i915_gem_object_create_internal(i915, PAGE_SIZE);
+	if (IS_ERR(out)) {
+		err = PTR_ERR(out);
+		goto out_batch;
+	}
+	i915_gem_object_set_cache_coherency(out, I915_CACHING_CACHED);
+
+	vma = i915_vma_instance(out, vm, NULL);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto out_batch;
+	}
+
+	err = i915_vma_pin(vma, 0, 0,
+			   PIN_USER |
+			   PIN_OFFSET_FIXED |
+			   (vm->total - PAGE_SIZE));
+	if (err)
+		goto out_out;
+	GEM_BUG_ON(vma->node.start != vm->total - PAGE_SIZE);
+
+	result = i915_gem_object_pin_map(out, I915_MAP_WB);
+	if (IS_ERR(result)) {
+		err = PTR_ERR(result);
+		goto out_out;
+	}
+
+	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
+		IGT_TIMEOUT(end_time);
+		unsigned long pass = 0;
+
+		if (!intel_engine_can_store_dword(ce->engine))
+			continue;
+
+		while (!__igt_timeout(end_time, NULL)) {
+			u64 offset;
+
+			offset = random_offset(0, vm->total - PAGE_SIZE,
+					       chunk_size, PAGE_SIZE);
+
+			err = vm->allocate_va_range(vm, offset, chunk_size);
+			if (err)
+				goto end;
+
+			memset32(result, STACK_MAGIC, PAGE_SIZE / sizeof(u32));
+
+			vma = i915_vma_instance(bbe, vm, NULL);
+			if (IS_ERR(vma)) {
+				err = PTR_ERR(vma);
+				goto end;
+			}
+
+			err = vma->ops->set_pages(vma);
+			if (err)
+				goto end;
+
+			/* Replace the TLB with target batches */
+			for (i = 0; i < count; i++) {
+				u32 *cs = batch + i * 64 / sizeof(*cs);
+				u64 addr;
+
+				vma->node.start = offset + i * PAGE_SIZE;
+				vm->insert_entries(vm, vma, I915_CACHE_NONE, 0);
+
+				addr = vma->node.start + i * 64;
+				cs[4] = MI_NOOP;
+				cs[6] = lower_32_bits(addr);
+				cs[7] = upper_32_bits(addr);
+				wmb();
+
+				err = submit_batch(ce, addr);
+				if (err)
+					goto end;
+			}
+
+			yield();
+			for (i = 0; i < count; i++)
+				batch[i * 64 / sizeof(*batch) + 4] =
+					MI_BATCH_BUFFER_END;
+			wmb();
+
+			vma->ops->clear_pages(vma);
+
+			err = context_sync(ce);
+			if (err) {
+				pr_err("%s: writes timed out\n",
+				       ce->engine->name);
+				goto end;
+			}
+
+			for (i = 0; i < count; i++) {
+				if (result[i] != i) {
+					pr_err("%s: Write lost on pass %lu, at offset %llx, index %d, found %x, expected %x\n",
+					       ce->engine->name, pass,
+					       offset, i, result[i], i);
+					err = -EINVAL;
+					goto end;
+				}
+			}
+
+			vm->clear_range(vm, offset, chunk_size);
+			pass++;
+		}
+	}
+end:
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+	i915_gem_context_unlock_engines(ctx);
+	i915_gem_object_unpin_map(out);
+out_out:
+	i915_gem_object_put(out);
+out_batch:
+	i915_gem_object_unpin_map(bbe);
+out_bbe:
+	i915_gem_object_put(bbe);
+out_unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	mock_file_free(i915, file);
+	return err;
+}
+
 int i915_gem_gtt_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
@@ -1722,6 +1948,7 @@  int i915_gem_gtt_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(igt_ggtt_pot),
 		SUBTEST(igt_ggtt_fill),
 		SUBTEST(igt_ggtt_page),
+		SUBTEST(igt_cs_tlb),
 	};
 
 	GEM_BUG_ON(offset_in_page(i915->ggtt.vm.total));