[v4,5/5] drm/i915/gt: Make sure that errors are propagated through request chains

Message ID 20230308094106.203686-6-andi.shyti@linux.intel.com (mailing list archive)
State New, archived
Series Fix error propagation amongst request

Commit Message

Andi Shyti March 8, 2023, 9:41 a.m. UTC
Currently, when we perform operations such as clearing or copying
large blocks of memory, we generate multiple requests that are
executed in a chain.

However, if one of these requests fails, we may not realize it
unless it happens to be the last request in the chain. This is
because errors are not properly propagated.

To address this issue, we need to ensure that the chain of fence
notifications is always propagated so that we can reach the final
fence associated with the last request. By doing so, we will be
able to detect any memory operation failures and determine
whether the memory is still invalid.

On copy and clear migration, signal fences upon request
completion so that the outcome of the operation is reliably
propagated.

Fixes: cf586021642d80 ("drm/i915/gt: Pipelined page migration")
Reported-by: Matthew Auld <matthew.auld@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Cc: stable@vger.kernel.org
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 41 ++++++++++++++++++-------
 1 file changed, 30 insertions(+), 11 deletions(-)

Comments

Matthew Auld March 10, 2023, 10:03 a.m. UTC | #1
On 08/03/2023 09:41, Andi Shyti wrote:
> Currently, when we perform operations such as clearing or copying
> large blocks of memory, we generate multiple requests that are
> executed in a chain.
> 
> However, if one of these requests fails, we may not realize it
> unless it happens to be the last request in the chain. This is
> because errors are not properly propagated.
> 
> To address this issue, we need to ensure that the chain of fence
> notifications is always propagated so that we can reach the final
> fence associated with the last request. By doing so, we will be
> able to detect any memory operation failures and determine
> whether the memory is still invalid.
> 
> On copy and clear migration, signal fences upon request
> completion so that the outcome of the operation is reliably
> propagated.
> 
> Fixes: cf586021642d80 ("drm/i915/gt: Pipelined page migration")
> Reported-by: Matthew Auld <matthew.auld@intel.com>
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: stable@vger.kernel.org
> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_migrate.c | 41 ++++++++++++++++++-------
>   1 file changed, 30 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> index 3f638f1987968..0031e7b1b4704 100644
> --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> @@ -742,13 +742,19 @@ intel_context_migrate_copy(struct intel_context *ce,
>   			dst_offset = 2 * CHUNK_SZ;
>   	}
>   
> +	/*
> +	 * While building the chain of requests, we need to ensure
> +	 * that no one can sneak into the timeline unnoticed.
> +	 */
> +	mutex_lock(&ce->timeline->mutex);
> +

Hmm, this looks different/new from the previous version. Why do we only 
do this for the copy and not the clear btw? Both should be conceptually 
the same. Sorry if I'm misunderstanding something here.
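
Presumably intel_context_migrate_clear() then needs the same
treatment; a hypothetical sketch mirroring the copy hunk above and
the series' new _locked helpers (not part of the posted patch) would
be:

	mutex_lock(&ce->timeline->mutex);

	do {
		rq = i915_request_create_locked(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		/* ... emit the clear commands as in the existing loop ... */

		i915_sw_fence_await(&rq->submit);
		i915_request_get(rq);
		i915_request_add_locked(rq);
		if (*out) {
			i915_sw_fence_complete(&(*out)->submit);
			i915_request_put(*out);
		}
		*out = rq;

		if (err || !it.sg || !sg_dma_len(it.sg))
			break;

		cond_resched();
	} while (1);

	mutex_unlock(&ce->timeline->mutex);

	if (*out)
		i915_sw_fence_complete(&(*out)->submit);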

>   	do {
>   		int len;
>   
> -		rq = i915_request_create(ce);
> +		rq = i915_request_create_locked(ce);
>   		if (IS_ERR(rq)) {
>   			err = PTR_ERR(rq);
> -			goto out_ce;
> +			break;
>   		}
>   
>   		if (deps) {
> @@ -878,10 +884,14 @@ intel_context_migrate_copy(struct intel_context *ce,
>   
>   		/* Arbitration is re-enabled between requests. */
>   out_rq:
> -		if (*out)
> +		i915_sw_fence_await(&rq->submit);
> +		i915_request_get(rq);
> +		i915_request_add_locked(rq);
> +		if (*out) {
> +			i915_sw_fence_complete(&(*out)->submit);
>   			i915_request_put(*out);
> -		*out = i915_request_get(rq);
> -		i915_request_add(rq);
> +		}
> +		*out = rq;
>   
>   		if (err)
>   			break;
> @@ -905,7 +915,10 @@ intel_context_migrate_copy(struct intel_context *ce,
>   		cond_resched();
>   	} while (1);
>   
> -out_ce:
> +	mutex_unlock(&ce->timeline->mutex);
> +
> +	if (*out)
> +		i915_sw_fence_complete(&(*out)->submit);
>   	return err;
>   }
>   
> @@ -1005,7 +1018,7 @@ intel_context_migrate_clear(struct intel_context *ce,
>   		rq = i915_request_create(ce);
>   		if (IS_ERR(rq)) {
>   			err = PTR_ERR(rq);
> -			goto out_ce;
> +			break;
>   		}
>   
>   		if (deps) {
> @@ -1056,17 +1069,23 @@ intel_context_migrate_clear(struct intel_context *ce,
>   
>   		/* Arbitration is re-enabled between requests. */
>   out_rq:
> -		if (*out)
> -			i915_request_put(*out);
> -		*out = i915_request_get(rq);
> +		i915_sw_fence_await(&rq->submit);
> +		i915_request_get(rq);
>   		i915_request_add(rq);
> +		if (*out) {
> +			i915_sw_fence_complete(&(*out)->submit);
> +			i915_request_put(*out);
> +		}
> +		*out = rq;
> +
>   		if (err || !it.sg || !sg_dma_len(it.sg))
>   			break;
>   
>   		cond_resched();
>   	} while (1);
>   
> -out_ce:
> +	if (*out)
> +		i915_sw_fence_complete(&(*out)->submit);
>   	return err;
>   }
>
Nirmoy Das April 11, 2023, 6:39 a.m. UTC | #2
On 3/8/2023 10:41 AM, Andi Shyti wrote:
> Currently, when we perform operations such as clearing or copying
> large blocks of memory, we generate multiple requests that are
> executed in a chain.
>
> However, if one of these requests fails, we may not realize it
> unless it happens to be the last request in the chain. This is
> because errors are not properly propagated.
>
> To address this issue, we need to ensure that the chain of fence
> notifications is always propagated so that we can reach the final
> fence associated with the last request. By doing so, we will be
> able to detect any memory operation failures and determine
> whether the memory is still invalid.
>
> On copy and clear migration, signal fences upon request
> completion so that the outcome of the operation is reliably
> propagated.
>
> Fixes: cf586021642d80 ("drm/i915/gt: Pipelined page migration")
> Reported-by: Matthew Auld <matthew.auld@intel.com>
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: stable@vger.kernel.org
> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
With Matt's comment regarding the missing lock in
intel_context_migrate_clear addressed, this is:

Acked-by: Nirmoy Das <nirmoy.das@intel.com>

Rodrigo Vivi April 11, 2023, 2:35 p.m. UTC | #3
On Tue, Apr 11, 2023 at 08:39:00AM +0200, Das, Nirmoy wrote:
> 
> On 3/8/2023 10:41 AM, Andi Shyti wrote:
> > Currently, when we perform operations such as clearing or copying
> > large blocks of memory, we generate multiple requests that are
> > executed in a chain.
> > 
> > However, if one of these requests fails, we may not realize it
> > unless it happens to be the last request in the chain. This is
> > because errors are not properly propagated.
> > 
> > To address this issue, we need to ensure that the chain of fence
> > notifications is always propagated so that we can reach the final
> > fence associated with the last request. By doing so, we will be
> > able to detect any memory operation failures and determine
> > whether the memory is still invalid.
> > 
> > On copy and clear migration, signal fences upon request
> > completion so that the outcome of the operation is reliably
> > propagated.
> > 
> > Fixes: cf586021642d80 ("drm/i915/gt: Pipelined page migration")
> > Reported-by: Matthew Auld <matthew.auld@intel.com>
> > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > Cc: stable@vger.kernel.org
> > Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> With Matt's comment regarding the missing lock in
> intel_context_migrate_clear addressed, this is:
> 
> Acked-by: Nirmoy Das <nirmoy.das@intel.com>

Nack!

Please get some ack from Joonas or Tvrtko before merging this series.

It is a big series targeting stable o.O where the revisions in the cover
letter are not helping me to be confident that this is the right approach
instead of simply reverting the original offending commit:

cf586021642d ("drm/i915/gt: Pipelined page migration")

It looks to me that we are adding magic on top of magic to work around
the deadlocks, but then adding more waits inside locks... And with
the hang checks vs heartbeats, is this really an issue in current
upstream code, or was it only on DII?

Where was the bug report to start with?

Andi Shyti April 12, 2023, 10:56 a.m. UTC | #4
Hi Rodrigo,

> > > Currently, when we perform operations such as clearing or copying
> > > large blocks of memory, we generate multiple requests that are
> > > executed in a chain.
> > > 
> > > However, if one of these requests fails, we may not realize it
> > > unless it happens to be the last request in the chain. This is
> > > because errors are not properly propagated.
> > > 
> > > To address this issue, we need to ensure that the chain of fence
> > > notifications is always propagated so that we can reach the final
> > > fence associated with the last request. By doing so, we will be
> > > able to detect any memory operation failures and determine
> > > whether the memory is still invalid.
> > > 
> > > On copy and clear migration, signal fences upon request
> > > completion so that the outcome of the operation is reliably
> > > propagated.
> > > 
> > > Fixes: cf586021642d80 ("drm/i915/gt: Pipelined page migration")
> > > Reported-by: Matthew Auld <matthew.auld@intel.com>
> > > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > Cc: stable@vger.kernel.org
> > > Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> > With Matt's comment regarding the missing lock in
> > intel_context_migrate_clear addressed, this is:
> > 
> > Acked-by: Nirmoy Das <nirmoy.das@intel.com>
> 
> Nack!
> 
> Please get some ack from Joonas or Tvrtko before merging this series.

There is no architectural change... of course, Joonas and Tvrtko
are more than welcome (and actually invited) to look into this
patch.

And, btw, there are still some discussions ongoing on this whole
series, so I'm not going to merge it any time soon. I'm just
happy to revive the discussion.

> It is a big series targeting stable o.O where the revisions in the cover
> letter are not helping me to be confident that this is the right approach
> instead of simply reverting the original offending commit:
> 
> cf586021642d ("drm/i915/gt: Pipelined page migration")

Why should we remove all the migration completely? What about the
copy?

> It looks to me that we are adding magic on top of magic to work around
> the deadlocks, but then adding more waits inside locks... And with
> the hang checks vs heartbeats, is this really an issue in current
> upstream code, or was it only on DII?

There is no real magic happening here. It's just that the error
was not reaching the end of the operation chain; this patch
passes it along.
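
As a minimal sketch of what the caller can now rely on (variable
names illustrative; the calls are the existing i915 request API):
waiting on the last request of the chain exposes the error of
whichever link failed:

	/* 'last' is the final request of the chain, returned via *out. */
	long ret = i915_request_wait(last, 0, MAX_SCHEDULE_TIMEOUT);

	if (ret < 0)
		err = ret;	/* the wait itself failed */
	else
		err = last->fence.error; /* error propagated along the chain */

	i915_request_put(last);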

> Where was the bug report to start with?

Matt has reported this; I will give you the necessary links offline.

Thanks for looking into this,
Andi
Rodrigo Vivi April 12, 2023, 1:10 p.m. UTC | #5
On Wed, Apr 12, 2023 at 12:56:26PM +0200, Andi Shyti wrote:
> Hi Rodrigo,
> 
> > > > Currently, when we perform operations such as clearing or copying
> > > > large blocks of memory, we generate multiple requests that are
> > > > executed in a chain.
> > > > 
> > > > However, if one of these requests fails, we may not realize it
> > > > unless it happens to be the last request in the chain. This is
> > > > because errors are not properly propagated.
> > > > 
> > > > To address this issue, we need to ensure that the chain of fence
> > > > notifications is always propagated so that we can reach the final
> > > > fence associated with the last request. By doing so, we will be
> > > > able to detect any memory operation failures and determine
> > > > whether the memory is still invalid.
> > > > 
> > > > On copy and clear migration, signal fences upon request
> > > > completion so that the outcome of the operation is reliably
> > > > propagated.
> > > > 
> > > > Fixes: cf586021642d80 ("drm/i915/gt: Pipelined page migration")
> > > > Reported-by: Matthew Auld <matthew.auld@intel.com>
> > > > Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
> > > > Cc: stable@vger.kernel.org
> > > > Reviewed-by: Matthew Auld <matthew.auld@intel.com>
> > > With Matt's comment regarding the missing lock in
> > > intel_context_migrate_clear addressed, this is:
> > > 
> > > Acked-by: Nirmoy Das <nirmoy.das@intel.com>
> > 
> > Nack!
> > 
> > Please get some ack from Joonas or Tvrtko before merging this series.
> 
> There is no architectural change... of course, Joonas and Tvrtko
> are more than welcome (and actually invited) to look into this
> patch.
> 
> And, btw, there are still some discussions ongoing on this whole
> series, so I'm not going to merge it any time soon. I'm just
> happy to revive the discussion.
> 
> > It is a big series targeting stable o.O where the revisions in the cover
> > letter are not helping me to be confident that this is the right approach
> > instead of simply reverting the original offending commit:
> > 
> > cf586021642d ("drm/i915/gt: Pipelined page migration")
> 
> Why should we remove all the migration completely? What about the
> copy?

Is there any other alternative that doesn't hurt the Linux stable rules?

I honestly fail to see that this one is "obviously corrected and tested",
and it looks to me that it has more "than 100 lines, with context".

Does this series really "fix only one thing" with 5 patches?

> 
> > It looks to me that we are adding magic on top of magic to work around
> > the deadlocks, but then adding more waits inside locks... And with
> > the hang checks vs heartbeats, is this really an issue in current
> > upstream code, or was it only on DII?
> 
> There is no real magic happening here. It's just that the error
> was not reaching the end of the operation chain; this patch
> passes it along.
> 
> > Where was the bug report to start with?
> 
> Matt has reported this; I will give you the necessary links offline.

It would be really good to have a report to see if this is a
"real bug that bothers people (not a, “This could be a problem…” type thing)."

All quotes above are from:
https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html

> 
> Thanks for looking into this,
> Andi
Tvrtko Ursulin April 13, 2023, 11:25 a.m. UTC | #6
On 12/04/2023 14:10, Rodrigo Vivi wrote:
> On Wed, Apr 12, 2023 at 12:56:26PM +0200, Andi Shyti wrote:
>> Hi Rodrigo,
>>
>>>>> Currently, when we perform operations such as clearing or copying
>>>>> large blocks of memory, we generate multiple requests that are
>>>>> executed in a chain.
>>>>>
>>>>> However, if one of these requests fails, we may not realize it
>>>>> unless it happens to be the last request in the chain. This is
>>>>> because errors are not properly propagated.
>>>>>
>>>>> To address this issue, we need to ensure that the chain of fence
>>>>> notifications is always propagated so that we can reach the final
>>>>> fence associated with the last request. By doing so, we will be
>>>>> able to detect any memory operation failures and determine
>>>>> whether the memory is still invalid.
>>>>>
>>>>> On copy and clear migration, signal fences upon request
>>>>> completion so that the outcome of the operation is reliably
>>>>> propagated.
>>>>>
>>>>> Fixes: cf586021642d80 ("drm/i915/gt: Pipelined page migration")
>>>>> Reported-by: Matthew Auld <matthew.auld@intel.com>
>>>>> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
>>>>> Cc: stable@vger.kernel.org

Try to find from which kernel version this needs to go in. For instance,
if we look at cf586021642d80 it would be v5.15+, but actually in that
commit there are no users apart from selftests. So I think find the
first user that can be user facing and mark the appropriate kernel
version in the stable tag.
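
As an aside, the stable tag format from stable-kernel-rules allows
spelling that version out, e.g.:

	Cc: <stable@vger.kernel.org> # v5.15+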

>>>>> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>>> With Matt's comment regarding the missing lock in
>>>> intel_context_migrate_clear addressed, this is:
>>>>
>>>> Acked-by: Nirmoy Das <nirmoy.das@intel.com>
>>>
>>> Nack!
>>>
>>> Please get some ack from Joonas or Tvrtko before merging this series.
>>
>> There is no architectural change... of course, Joonas and Tvrtko
>> are more than welcome (and actually invited) to look into this
>> patch.
>>
>> And, btw, there are still some discussions ongoing on this whole
>> series, so I'm not going to merge it any time soon. I'm just
>> happy to revive the discussion.
>>
>>> It is a big series targeting stable o.O where the revisions in the cover
>>> letter are not helping me to be confident that this is the right approach
>>> instead of simply reverting the original offending commit:
>>>
>>> cf586021642d ("drm/i915/gt: Pipelined page migration")
>>
>> Why should we remove all the migration completely? What about the
>> copy?
> 
> Is there any other alternative that doesn't hurt the Linux stable rules?
> 
> I honestly fail to see that this one is "obviously corrected and tested",
> and it looks to me that it has more "than 100 lines, with context".
> 
> Does this series really "fix only one thing" with 5 patches?

This is challenging.

The fix looks needed to me on the high level (I haven't read the patch
details yet), but sending a series to stable can go quite badly; we had
such a problem very recently with only a two-patch series. And this one
is also indeed quite large.

Reverting cf586021642d80 definitely isn't an option because stuff
depends on the code added by it and would need an alternative
implementation. We would lose async clear/migrate, which would be bad,
and implementing the alternative could also mean a large patch.

So since I think we are indeed stuck with fixing this - would it be 
better to squash it all into one patch for easier backporting?

We can also see if there are ways to make the diff smaller.

Regards,

Tvrtko

>>> It looks to me that we are adding magic on top of magic to work around
>>> the deadlocks, but then adding more waits inside locks... And with
>>> the hang checks vs heartbeats, is this really an issue in current
>>> upstream code, or was it only on DII?
>>
>> There is no real magic happening here. It's just that the error
>> was not reaching the end of the operation chain; this patch
>> passes it along.
>>
>>> Where was the bug report to start with?
>>
>> Matt has reported this; I will give you the necessary links offline.
> 
> It would be really good to have a report to see if this is a
> "real bug that bothers people (not a, “This could be a problem…” type thing)."
> 
> All quotes above are from:
> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> 
>>
>> Thanks for looking into this,
>> Andi

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 3f638f1987968..0031e7b1b4704 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -742,13 +742,19 @@  intel_context_migrate_copy(struct intel_context *ce,
 			dst_offset = 2 * CHUNK_SZ;
 	}
 
+	/*
+	 * While building the chain of requests, we need to ensure
+	 * that no one can sneak into the timeline unnoticed.
+	 */
+	mutex_lock(&ce->timeline->mutex);
+
 	do {
 		int len;
 
-		rq = i915_request_create(ce);
+		rq = i915_request_create_locked(ce);
 		if (IS_ERR(rq)) {
 			err = PTR_ERR(rq);
-			goto out_ce;
+			break;
 		}
 
 		if (deps) {
@@ -878,10 +884,14 @@  intel_context_migrate_copy(struct intel_context *ce,
 
 		/* Arbitration is re-enabled between requests. */
 out_rq:
-		if (*out)
+		i915_sw_fence_await(&rq->submit);
+		i915_request_get(rq);
+		i915_request_add_locked(rq);
+		if (*out) {
+			i915_sw_fence_complete(&(*out)->submit);
 			i915_request_put(*out);
-		*out = i915_request_get(rq);
-		i915_request_add(rq);
+		}
+		*out = rq;
 
 		if (err)
 			break;
@@ -905,7 +915,10 @@  intel_context_migrate_copy(struct intel_context *ce,
 		cond_resched();
 	} while (1);
 
-out_ce:
+	mutex_unlock(&ce->timeline->mutex);
+
+	if (*out)
+		i915_sw_fence_complete(&(*out)->submit);
 	return err;
 }
 
@@ -1005,7 +1018,7 @@  intel_context_migrate_clear(struct intel_context *ce,
 		rq = i915_request_create(ce);
 		if (IS_ERR(rq)) {
 			err = PTR_ERR(rq);
-			goto out_ce;
+			break;
 		}
 
 		if (deps) {
@@ -1056,17 +1069,23 @@  intel_context_migrate_clear(struct intel_context *ce,
 
 		/* Arbitration is re-enabled between requests. */
 out_rq:
-		if (*out)
-			i915_request_put(*out);
-		*out = i915_request_get(rq);
+		i915_sw_fence_await(&rq->submit);
+		i915_request_get(rq);
 		i915_request_add(rq);
+		if (*out) {
+			i915_sw_fence_complete(&(*out)->submit);
+			i915_request_put(*out);
+		}
+		*out = rq;
+
 		if (err || !it.sg || !sg_dma_len(it.sg))
 			break;
 
 		cond_resched();
 	} while (1);
 
-out_ce:
+	if (*out)
+		i915_sw_fence_complete(&(*out)->submit);
 	return err;
 }