Revert "drm/i915: use a separate context for gpu relocs"
diff mbox series

Message ID 20191129124846.949100-1-chris@chris-wilson.co.uk
State New
Headers show
Series
  • Revert "drm/i915: use a separate context for gpu relocs"
Related show

Commit Message

Chris Wilson Nov. 29, 2019, 12:48 p.m. UTC
Since commit c45e788d95b4 ("drm/i915/tgl: Suspend pre-parser across GTT
invalidations"), we now disable the advanced preparser on Tigerlake for the
invalidation phase at the start of the batch, so we no longer need to emit
the GPU relocations from a second context as they are now flushed inline.

References: 8a9a982767b7 ("drm/i915: use a separate context for gpu relocs")
References: c45e788d95b4 ("drm/i915/tgl: Suspend pre-parser across GTT invalidations")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 30 +------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

Comments

Mika Kuoppala Nov. 29, 2019, 1:07 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Since commit c45e788d95b4 ("drm/i915/tgl: Suspend pre-parser across GTT
> invalidations"), we now disable the advanced preparser on Tigerlake for the
> invalidation phase at the start of the batch, we no longer need to emit
> the GPU relocations from a second context as they are now flushed inlined.
>
> References: 8a9a982767b7 ("drm/i915: use a separate context for gpu relocs")
> References: c45e788d95b4 ("drm/i915/tgl: Suspend pre-parser across GTT invalidations")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Not a complete revert, taking care of preserving the valuable
comment about preparser before gen12_emit_preempt_busywait().

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 30 +------------------
>  1 file changed, 1 insertion(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 7a87e8270460..459f4d40b69b 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -253,7 +253,6 @@ struct i915_execbuffer {
>  		bool has_fence : 1;
>  		bool needs_unfenced : 1;
>  
> -		struct intel_context *ce;
>  		struct i915_request *rq;
>  		u32 *rq_cmd;
>  		unsigned int rq_size;
> @@ -886,9 +885,6 @@ static void eb_destroy(const struct i915_execbuffer *eb)
>  {
>  	GEM_BUG_ON(eb->reloc_cache.rq);
>  
> -	if (eb->reloc_cache.ce)
> -		intel_context_put(eb->reloc_cache.ce);
> -
>  	if (eb->lut_size > 0)
>  		kfree(eb->buckets);
>  }
> @@ -912,7 +908,6 @@ static void reloc_cache_init(struct reloc_cache *cache,
>  	cache->has_fence = cache->gen < 4;
>  	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
>  	cache->node.flags = 0;
> -	cache->ce = NULL;
>  	cache->rq = NULL;
>  	cache->rq_size = 0;
>  }
> @@ -1182,7 +1177,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
>  	if (err)
>  		goto err_unmap;
>  
> -	rq = intel_context_create_request(cache->ce);
> +	rq = i915_request_create(eb->context);
>  	if (IS_ERR(rq)) {
>  		err = PTR_ERR(rq);
>  		goto err_unpin;
> @@ -1253,29 +1248,6 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
>  		if (!intel_engine_can_store_dword(eb->engine))
>  			return ERR_PTR(-ENODEV);
>  
> -		if (!cache->ce) {
> -			struct intel_context *ce;
> -
> -			/*
> -			 * The CS pre-parser can pre-fetch commands across
> -			 * memory sync points and starting gen12 it is able to
> -			 * pre-fetch across BB_START and BB_END boundaries
> -			 * (within the same context). We therefore use a
> -			 * separate context gen12+ to guarantee that the reloc
> -			 * writes land before the parser gets to the target
> -			 * memory location.
> -			 */
> -			if (cache->gen >= 12)
> -				ce = intel_context_create(eb->context->gem_context,
> -							  eb->engine);
> -			else
> -				ce = intel_context_get(eb->context);
> -			if (IS_ERR(ce))
> -				return ERR_CAST(ce);
> -
> -			cache->ce = ce;
> -		}
> -
>  		err = __reloc_gpu_alloc(eb, vma, len);
>  		if (unlikely(err))
>  			return ERR_PTR(err);
> -- 
> 2.24.0
Daniele Ceraolo Spurio Dec. 3, 2019, 10:19 p.m. UTC | #2
On 11/29/19 4:48 AM, Chris Wilson wrote:
> Since commit c45e788d95b4 ("drm/i915/tgl: Suspend pre-parser across GTT
> invalidations"), we now disable the advanced preparser on Tigerlake for the
> invalidation phase at the start of the batch, we no longer need to emit
> the GPU relocations from a second context as they are now flushed inlined.
> 

c45e788d95b4 only applies to the RCS though and IIRC I've seen issues 
with the relocations on other engines as well, although they were much 
rarer. Also, the comment left in intel_lrc.c still references reloc_gpu().

Daniele

> References: 8a9a982767b7 ("drm/i915: use a separate context for gpu relocs")
> References: c45e788d95b4 ("drm/i915/tgl: Suspend pre-parser across GTT invalidations")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> ---
>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 30 +------------------
>   1 file changed, 1 insertion(+), 29 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index 7a87e8270460..459f4d40b69b 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -253,7 +253,6 @@ struct i915_execbuffer {
>   		bool has_fence : 1;
>   		bool needs_unfenced : 1;
>   
> -		struct intel_context *ce;
>   		struct i915_request *rq;
>   		u32 *rq_cmd;
>   		unsigned int rq_size;
> @@ -886,9 +885,6 @@ static void eb_destroy(const struct i915_execbuffer *eb)
>   {
>   	GEM_BUG_ON(eb->reloc_cache.rq);
>   
> -	if (eb->reloc_cache.ce)
> -		intel_context_put(eb->reloc_cache.ce);
> -
>   	if (eb->lut_size > 0)
>   		kfree(eb->buckets);
>   }
> @@ -912,7 +908,6 @@ static void reloc_cache_init(struct reloc_cache *cache,
>   	cache->has_fence = cache->gen < 4;
>   	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
>   	cache->node.flags = 0;
> -	cache->ce = NULL;
>   	cache->rq = NULL;
>   	cache->rq_size = 0;
>   }
> @@ -1182,7 +1177,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
>   	if (err)
>   		goto err_unmap;
>   
> -	rq = intel_context_create_request(cache->ce);
> +	rq = i915_request_create(eb->context);
>   	if (IS_ERR(rq)) {
>   		err = PTR_ERR(rq);
>   		goto err_unpin;
> @@ -1253,29 +1248,6 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
>   		if (!intel_engine_can_store_dword(eb->engine))
>   			return ERR_PTR(-ENODEV);
>   
> -		if (!cache->ce) {
> -			struct intel_context *ce;
> -
> -			/*
> -			 * The CS pre-parser can pre-fetch commands across
> -			 * memory sync points and starting gen12 it is able to
> -			 * pre-fetch across BB_START and BB_END boundaries
> -			 * (within the same context). We therefore use a
> -			 * separate context gen12+ to guarantee that the reloc
> -			 * writes land before the parser gets to the target
> -			 * memory location.
> -			 */
> -			if (cache->gen >= 12)
> -				ce = intel_context_create(eb->context->gem_context,
> -							  eb->engine);
> -			else
> -				ce = intel_context_get(eb->context);
> -			if (IS_ERR(ce))
> -				return ERR_CAST(ce);
> -
> -			cache->ce = ce;
> -		}
> -
>   		err = __reloc_gpu_alloc(eb, vma, len);
>   		if (unlikely(err))
>   			return ERR_PTR(err);
>
Chris Wilson Dec. 4, 2019, 6:21 p.m. UTC | #3
Quoting Daniele Ceraolo Spurio (2019-12-03 22:19:07)
> 
> 
> On 11/29/19 4:48 AM, Chris Wilson wrote:
> > Since commit c45e788d95b4 ("drm/i915/tgl: Suspend pre-parser across GTT
> > invalidations"), we now disable the advanced preparser on Tigerlake for the
> > invalidation phase at the start of the batch, we no longer need to emit
> > the GPU relocations from a second context as they are now flushed inlined.
> > 
> 
> c45e788d95b4 only applies to the RCS though and IIRC I've seen issues 
> with the relocations on other engines as well, although they were much 
> rarer. Also, the comment left in intel_lrc.c still references reloc_gpu().

The tests we have don't discriminate between the engines, and so far
we've only observed the issue on RCS. Look at igt_cs_tlb and see
what it might be missing wrt triggering the issue on the other
engines.
-Chris

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 7a87e8270460..459f4d40b69b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -253,7 +253,6 @@  struct i915_execbuffer {
 		bool has_fence : 1;
 		bool needs_unfenced : 1;
 
-		struct intel_context *ce;
 		struct i915_request *rq;
 		u32 *rq_cmd;
 		unsigned int rq_size;
@@ -886,9 +885,6 @@  static void eb_destroy(const struct i915_execbuffer *eb)
 {
 	GEM_BUG_ON(eb->reloc_cache.rq);
 
-	if (eb->reloc_cache.ce)
-		intel_context_put(eb->reloc_cache.ce);
-
 	if (eb->lut_size > 0)
 		kfree(eb->buckets);
 }
@@ -912,7 +908,6 @@  static void reloc_cache_init(struct reloc_cache *cache,
 	cache->has_fence = cache->gen < 4;
 	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->node.flags = 0;
-	cache->ce = NULL;
 	cache->rq = NULL;
 	cache->rq_size = 0;
 }
@@ -1182,7 +1177,7 @@  static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 	if (err)
 		goto err_unmap;
 
-	rq = intel_context_create_request(cache->ce);
+	rq = i915_request_create(eb->context);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto err_unpin;
@@ -1253,29 +1248,6 @@  static u32 *reloc_gpu(struct i915_execbuffer *eb,
 		if (!intel_engine_can_store_dword(eb->engine))
 			return ERR_PTR(-ENODEV);
 
-		if (!cache->ce) {
-			struct intel_context *ce;
-
-			/*
-			 * The CS pre-parser can pre-fetch commands across
-			 * memory sync points and starting gen12 it is able to
-			 * pre-fetch across BB_START and BB_END boundaries
-			 * (within the same context). We therefore use a
-			 * separate context gen12+ to guarantee that the reloc
-			 * writes land before the parser gets to the target
-			 * memory location.
-			 */
-			if (cache->gen >= 12)
-				ce = intel_context_create(eb->context->gem_context,
-							  eb->engine);
-			else
-				ce = intel_context_get(eb->context);
-			if (IS_ERR(ce))
-				return ERR_CAST(ce);
-
-			cache->ce = ce;
-		}
-
 		err = __reloc_gpu_alloc(eb, vma, len);
 		if (unlikely(err))
 			return ERR_PTR(err);