diff mbox series

[v2] drm/i915/gt: Hold a wakeref for the active VM

Message ID 20230330-hold_wakeref_for_active_vm-v2-1-724d201499c2@intel.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/i915/gt: Hold a wakeref for the active VM | expand

Commit Message

Andrzej Hajda March 31, 2023, 2:16 p.m. UTC
From: Chris Wilson <chris@chris-wilson.co.uk>

There may be a disconnect between the GT used by the engine and the GT
used for the VM, requiring us to hold a wakeref on both while the GPU is
active with this request.

v2: added explanation to __queue_and_release_pm

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
[ahajda: removed not-yet-upstremed wakeref tracking bits]
Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com>
---
Changes in v2:
- Link to v1: https://lore.kernel.org/r/20230330-hold_wakeref_for_active_vm-v1-1-baca712692f6@intel.com
---
 drivers/gpu/drm/i915/gt/intel_context.h   | 15 +++++++++++----
 drivers/gpu/drm/i915/gt/intel_engine_pm.c |  9 +++++++++
 2 files changed, 20 insertions(+), 4 deletions(-)


---
base-commit: 3385d6482cd60f2a0bbb0fa97b70ae7dbba4f95c
change-id: 20230330-hold_wakeref_for_active_vm-7f013a449ef3

Best regards,

Comments

Tvrtko Ursulin March 31, 2023, 2:51 p.m. UTC | #1
On 31/03/2023 15:16, Andrzej Hajda wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> There may be a disconnect between the GT used by the engine and the GT
> used for the VM, requiring us to hold a wakeref on both while the GPU is
> active with this request.
> 
> v2: added explanation to __queue_and_release_pm
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> [ahajda: removed not-yet-upstremed wakeref tracking bits]
> Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com>
> ---
> Changes in v2:
> - Link to v1: https://lore.kernel.org/r/20230330-hold_wakeref_for_active_vm-v1-1-baca712692f6@intel.com
> ---
>   drivers/gpu/drm/i915/gt/intel_context.h   | 15 +++++++++++----
>   drivers/gpu/drm/i915/gt/intel_engine_pm.c |  9 +++++++++
>   2 files changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index 0a8d553da3f439..48f888c3da083b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -14,6 +14,7 @@
>   #include "i915_drv.h"
>   #include "intel_context_types.h"
>   #include "intel_engine_types.h"
> +#include "intel_gt_pm.h"
>   #include "intel_ring_types.h"
>   #include "intel_timeline_types.h"
>   #include "i915_trace.h"
> @@ -207,8 +208,11 @@ void intel_context_exit_engine(struct intel_context *ce);
>   static inline void intel_context_enter(struct intel_context *ce)
>   {
>   	lockdep_assert_held(&ce->timeline->mutex);
> -	if (!ce->active_count++)
> -		ce->ops->enter(ce);
> +	if (ce->active_count++)
> +		return;
> +
> +	ce->ops->enter(ce);
> +	intel_gt_pm_get(ce->vm->gt);
>   }
>   
>   static inline void intel_context_mark_active(struct intel_context *ce)
> @@ -222,8 +226,11 @@ static inline void intel_context_exit(struct intel_context *ce)
>   {
>   	lockdep_assert_held(&ce->timeline->mutex);
>   	GEM_BUG_ON(!ce->active_count);
> -	if (!--ce->active_count)
> -		ce->ops->exit(ce);
> +	if (--ce->active_count)
> +		return;
> +
> +	intel_gt_pm_put_async(ce->vm->gt);
> +	ce->ops->exit(ce);
>   }
>   
>   static inline struct intel_context *intel_context_get(struct intel_context *ce)
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index e971b153fda976..ee531a5c142c77 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -114,6 +114,15 @@ __queue_and_release_pm(struct i915_request *rq,
>   
>   	ENGINE_TRACE(engine, "parking\n");
>   
> +	/*
> +	 * Open coded one half of intel_context_enter, which we have to omit
> +	 * here (see the large comment below) and because the other part must
> +	 * not be called due constructing directly with __i915_request_create
> +	 * which increments active count via intel_context_mark_active.
> +	 */
> +	GEM_BUG_ON(rq->context->active_count != 1);
> +	__intel_gt_pm_get(engine->gt);
> +
>   	/*
>   	 * We have to serialise all potential retirement paths with our
>   	 * submission, as we don't want to underflow either the

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Andrzej Hajda April 3, 2023, 8:58 a.m. UTC | #2
On 01.04.2023 20:02, Patchwork wrote:
> Project List - Patchwork *Patch Details*
> *Series:* 	drm/i915/gt: Hold a wakeref for the active VM (rev2)
> *URL:* 	https://patchwork.freedesktop.org/series/115873/
> *State:* 	failure
> *Details:* 
> https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/index.html
>
>
>   CI Bug Log - changes from CI_DRM_12951_full -> Patchwork_115873v2_full
>
>
>     Summary
>
> *FAILURE*
>
> Serious unknown changes coming with Patchwork_115873v2_full absolutely 
> need to be
> verified manually.
>
> If you think the reported changes have nothing to do with the changes
> introduced in Patchwork_115873v2_full, please notify your bug team to 
> allow them
> to document this new failure mode, which will reduce false positives 
> in CI.
>
>
>     Participating hosts (7 -> 7)
>
> No changes in participating hosts
>
>
>     Possible new issues
>
> Here are the unknown changes that may have been introduced in 
> Patchwork_115873v2_full:
>
>
>       IGT changes
>
>
>         Possible regressions
>
>   * igt@gem_exec_fence@syncobj-repeat:
>       o shard-glk: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-glk5/igt@gem_exec_fence@syncobj-repeat.html>
>         -> TIMEOUT
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-glk9/igt@gem_exec_fence@syncobj-repeat.html>
>         +2 similar issues
>

The actual test did not even start, because of failure on previous tests.
All tests since igt@perf@enable-disable@0-rcs0 timeouts on this machine[1].
Since igt@perf@enable-disable is untrusted/suppressed I guess it is not 
related.
Moreover v1(which differs only by comment) passed.
Summarizing apparently known bug in igt@perf@subtests [2].

[1]: 
http://gfx-ci.igk.intel.com/tree/drm-tip/Patchwork_115873v2/shard-glk9/igt_runner13.txt
[2]: https://gitlab.freedesktop.org/drm/intel/-/issues/5213

Regards
Andrzej

>  *
>
>
>         Warnings
>
>   * igt@kms_frontbuffer_tracking@fbcpsr-1p-offscren-pri-shrfb-draw-pwrite:
>
>       o shard-glk: SKIP
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-glk5/igt@kms_frontbuffer_tracking@fbcpsr-1p-offscren-pri-shrfb-draw-pwrite.html>
>         (fdo#109271
>         <https://bugs.freedesktop.org/show_bug.cgi?id=109271>) ->
>         TIMEOUT
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-glk9/igt@kms_frontbuffer_tracking@fbcpsr-1p-offscren-pri-shrfb-draw-pwrite.html>
>         +2 similar issues
>
>
>         Suppressed
>
> The following results come from untrusted machines, tests, or statuses.
> They do not affect the overall result.
>
>   * {igt@perf@enable-disable@0-rcs0}:
>       o shard-glk: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-glk5/igt@perf@enable-disable@0-rcs0.html>
>         -> TIMEOUT
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-glk9/igt@perf@enable-disable@0-rcs0.html>
>
>
>     Known issues
>
> Here are the changes found in Patchwork_115873v2_full that come from 
> known issues:
>
>
>       IGT changes
>
>
>         Issues hit
>
>  *
>
>     igt@gem_exec_fair@basic-deadline:
>
>       o shard-glk: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-glk9/igt@gem_exec_fair@basic-deadline.html>
>         -> FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-glk8/igt@gem_exec_fair@basic-deadline.html>
>         (i915#2846 <https://gitlab.freedesktop.org/drm/intel/issues/2846>)
>  *
>
>     igt@gen9_exec_parse@allowed-single:
>
>       o shard-apl: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-apl7/igt@gen9_exec_parse@allowed-single.html>
>         -> ABORT
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-apl3/igt@gen9_exec_parse@allowed-single.html>
>         (i915#5566 <https://gitlab.freedesktop.org/drm/intel/issues/5566>)
>  *
>
>     igt@i915_selftest@live@gt_heartbeat:
>
>       o shard-apl: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-apl1/igt@i915_selftest@live@gt_heartbeat.html>
>         -> DMESG-FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-apl2/igt@i915_selftest@live@gt_heartbeat.html>
>         (i915#5334 <https://gitlab.freedesktop.org/drm/intel/issues/5334>)
>  *
>
>     igt@kms_async_flips@alternate-sync-async-flip@pipe-a-vga-1:
>
>       o shard-snb: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-snb5/igt@kms_async_flips@alternate-sync-async-flip@pipe-a-vga-1.html>
>         -> FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-snb2/igt@kms_async_flips@alternate-sync-async-flip@pipe-a-vga-1.html>
>         (i915#2521 <https://gitlab.freedesktop.org/drm/intel/issues/2521>)
>  *
>
>     igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions:
>
>       o shard-glk: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-glk1/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html>
>         -> FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-glk2/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html>
>         (i915#2346 <https://gitlab.freedesktop.org/drm/intel/issues/2346>)
>  *
>
>     igt@kms_flip@flip-vs-expired-vblank@b-dp1:
>
>       o shard-apl: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-apl4/igt@kms_flip@flip-vs-expired-vblank@b-dp1.html>
>         -> FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-apl7/igt@kms_flip@flip-vs-expired-vblank@b-dp1.html>
>         (i915#79 <https://gitlab.freedesktop.org/drm/intel/issues/79>)
>  *
>
>     igt@kms_plane@plane-panning-bottom-right-suspend@pipe-a-planes:
>
>       o shard-apl: PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-apl7/igt@kms_plane@plane-panning-bottom-right-suspend@pipe-a-planes.html>
>         -> ABORT
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-apl3/igt@kms_plane@plane-panning-bottom-right-suspend@pipe-a-planes.html>
>         (i915#180 <https://gitlab.freedesktop.org/drm/intel/issues/180>)
>
>
>         Possible fixes
>
>  *
>
>     igt@gem_ctx_exec@basic-nohangcheck:
>
>       o {shard-tglu}: FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-tglu-2/igt@gem_ctx_exec@basic-nohangcheck.html>
>         (i915#6268
>         <https://gitlab.freedesktop.org/drm/intel/issues/6268>) ->
>         PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-tglu-10/igt@gem_ctx_exec@basic-nohangcheck.html>
>  *
>
>     igt@gem_eio@in-flight-contexts-1us:
>
>       o {shard-tglu}: TIMEOUT
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-tglu-7/igt@gem_eio@in-flight-contexts-1us.html>
>         (i915#3063
>         <https://gitlab.freedesktop.org/drm/intel/issues/3063>) ->
>         PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-tglu-3/igt@gem_eio@in-flight-contexts-1us.html>
>  *
>
>     igt@gem_eio@reset-stress:
>
>       o {shard-dg1}: FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-dg1-14/igt@gem_eio@reset-stress.html>
>         (i915#5784
>         <https://gitlab.freedesktop.org/drm/intel/issues/5784>) ->
>         PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-dg1-15/igt@gem_eio@reset-stress.html>
>  *
>
>     igt@kms_cursor_legacy@2x-long-flip-vs-cursor-atomic:
>
>       o shard-glk: FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-glk1/igt@kms_cursor_legacy@2x-long-flip-vs-cursor-atomic.html>
>         (i915#72 <https://gitlab.freedesktop.org/drm/intel/issues/72>)
>         -> PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-glk2/igt@kms_cursor_legacy@2x-long-flip-vs-cursor-atomic.html>
>  *
>
>     igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size:
>
>      o
>
>         shard-apl: FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-apl2/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html>
>         (i915#2346
>         <https://gitlab.freedesktop.org/drm/intel/issues/2346>) ->
>         PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-apl6/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html>
>
>      o
>
>         shard-glk: FAIL
>         <https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_12951/shard-glk2/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html>
>         (i915#2346
>         <https://gitlab.freedesktop.org/drm/intel/issues/2346>) ->
>         PASS
>         <https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_115873v2/shard-glk4/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions-varying-size.html>
>
> {name}: This element is suppressed. This means it is ignored when 
> computing
> the status of the difference (SUCCESS, WARNING, or FAILURE).
>
>
>     Build changes
>
>   * Linux: CI_DRM_12951 -> Patchwork_115873v2
>
> CI-20190529: 20190529
> CI_DRM_12951: f128906b94b25a0f0c12dc8c647b8adc8d934d8c @ 
> git://anongit.freedesktop.org/gfx-ci/linux
> IGT_7230: f0485204004305dd3ee8f8bbbb9c552e53a4e050 @ 
> https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
> Patchwork_115873v2: f128906b94b25a0f0c12dc8c647b8adc8d934d8c @ 
> git://anongit.freedesktop.org/gfx-ci/linux
> piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ 
> git://anongit.freedesktop.org/piglit
>
Andi Shyti April 4, 2023, 3:39 p.m. UTC | #3
Hi Andrzej,

> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> index 0a8d553da3f439..48f888c3da083b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_context.h
> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> @@ -14,6 +14,7 @@
>  #include "i915_drv.h"
>  #include "intel_context_types.h"
>  #include "intel_engine_types.h"
> +#include "intel_gt_pm.h"
>  #include "intel_ring_types.h"
>  #include "intel_timeline_types.h"
>  #include "i915_trace.h"
> @@ -207,8 +208,11 @@ void intel_context_exit_engine(struct intel_context *ce);
>  static inline void intel_context_enter(struct intel_context *ce)
>  {
>  	lockdep_assert_held(&ce->timeline->mutex);
> -	if (!ce->active_count++)
> -		ce->ops->enter(ce);
> +	if (ce->active_count++)
> +		return;
> +
> +	ce->ops->enter(ce);
> +	intel_gt_pm_get(ce->vm->gt);
>  }
>  
>  static inline void intel_context_mark_active(struct intel_context *ce)
> @@ -222,8 +226,11 @@ static inline void intel_context_exit(struct intel_context *ce)
>  {
>  	lockdep_assert_held(&ce->timeline->mutex);
>  	GEM_BUG_ON(!ce->active_count);
> -	if (!--ce->active_count)
> -		ce->ops->exit(ce);
> +	if (--ce->active_count)
> +		return;
> +
> +	intel_gt_pm_put_async(ce->vm->gt);
> +	ce->ops->exit(ce);

shouldn't these two be swapped?

>  }
>  
>  static inline struct intel_context *intel_context_get(struct intel_context *ce)
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> index e971b153fda976..ee531a5c142c77 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> @@ -114,6 +114,15 @@ __queue_and_release_pm(struct i915_request *rq,
>  
>  	ENGINE_TRACE(engine, "parking\n");
>  
> +	/*
> +	 * Open coded one half of intel_context_enter, which we have to omit
> +	 * here (see the large comment below) and because the other part must
> +	 * not be called due constructing directly with __i915_request_create
> +	 * which increments active count via intel_context_mark_active.
> +	 */
> +	GEM_BUG_ON(rq->context->active_count != 1);
> +	__intel_gt_pm_get(engine->gt);

where is it's brother "put"?

Thanks,
Andi

> +
>  	/*
>  	 * We have to serialise all potential retirement paths with our
>  	 * submission, as we don't want to underflow either the
> 
> ---
> base-commit: 3385d6482cd60f2a0bbb0fa97b70ae7dbba4f95c
> change-id: 20230330-hold_wakeref_for_active_vm-7f013a449ef3
> 
> Best regards,
> -- 
> Andrzej Hajda <andrzej.hajda@intel.com>
Tvrtko Ursulin April 4, 2023, 3:57 p.m. UTC | #4
On 04/04/2023 16:39, Andi Shyti wrote:
> Hi Andrzej,
> 
>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
>> index 0a8d553da3f439..48f888c3da083b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>> @@ -14,6 +14,7 @@
>>   #include "i915_drv.h"
>>   #include "intel_context_types.h"
>>   #include "intel_engine_types.h"
>> +#include "intel_gt_pm.h"
>>   #include "intel_ring_types.h"
>>   #include "intel_timeline_types.h"
>>   #include "i915_trace.h"
>> @@ -207,8 +208,11 @@ void intel_context_exit_engine(struct intel_context *ce);
>>   static inline void intel_context_enter(struct intel_context *ce)
>>   {
>>   	lockdep_assert_held(&ce->timeline->mutex);
>> -	if (!ce->active_count++)
>> -		ce->ops->enter(ce);
>> +	if (ce->active_count++)
>> +		return;
>> +
>> +	ce->ops->enter(ce);
>> +	intel_gt_pm_get(ce->vm->gt);
>>   }
>>   
>>   static inline void intel_context_mark_active(struct intel_context *ce)
>> @@ -222,8 +226,11 @@ static inline void intel_context_exit(struct intel_context *ce)
>>   {
>>   	lockdep_assert_held(&ce->timeline->mutex);
>>   	GEM_BUG_ON(!ce->active_count);
>> -	if (!--ce->active_count)
>> -		ce->ops->exit(ce);
>> +	if (--ce->active_count)
>> +		return;
>> +
>> +	intel_gt_pm_put_async(ce->vm->gt);
>> +	ce->ops->exit(ce);
> 
> shouldn't these two be swapped?
> 
>>   }
>>   
>>   static inline struct intel_context *intel_context_get(struct intel_context *ce)
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>> index e971b153fda976..ee531a5c142c77 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>> @@ -114,6 +114,15 @@ __queue_and_release_pm(struct i915_request *rq,
>>   
>>   	ENGINE_TRACE(engine, "parking\n");
>>   
>> +	/*
>> +	 * Open coded one half of intel_context_enter, which we have to omit
>> +	 * here (see the large comment below) and because the other part must
>> +	 * not be called due constructing directly with __i915_request_create
>> +	 * which increments active count via intel_context_mark_active.
>> +	 */
>> +	GEM_BUG_ON(rq->context->active_count != 1);
>> +	__intel_gt_pm_get(engine->gt);
> 
> where is it's brother "put"?

It's in request retire via intel_context_exit. Ie. request construction 
is special here, while retirement is standard.

Regards,

Tvrtko

> 
> Thanks,
> Andi
> 
>> +
>>   	/*
>>   	 * We have to serialise all potential retirement paths with our
>>   	 * submission, as we don't want to underflow either the
>>
>> ---
>> base-commit: 3385d6482cd60f2a0bbb0fa97b70ae7dbba4f95c
>> change-id: 20230330-hold_wakeref_for_active_vm-7f013a449ef3
>>
>> Best regards,
>> -- 
>> Andrzej Hajda <andrzej.hajda@intel.com>
Andi Shyti April 4, 2023, 4 p.m. UTC | #5
Hi Tvrtko,

> > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
> > > index 0a8d553da3f439..48f888c3da083b 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_context.h
> > > +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> > > @@ -14,6 +14,7 @@
> > >   #include "i915_drv.h"
> > >   #include "intel_context_types.h"
> > >   #include "intel_engine_types.h"
> > > +#include "intel_gt_pm.h"
> > >   #include "intel_ring_types.h"
> > >   #include "intel_timeline_types.h"
> > >   #include "i915_trace.h"
> > > @@ -207,8 +208,11 @@ void intel_context_exit_engine(struct intel_context *ce);
> > >   static inline void intel_context_enter(struct intel_context *ce)
> > >   {
> > >   	lockdep_assert_held(&ce->timeline->mutex);
> > > -	if (!ce->active_count++)
> > > -		ce->ops->enter(ce);
> > > +	if (ce->active_count++)
> > > +		return;
> > > +
> > > +	ce->ops->enter(ce);
> > > +	intel_gt_pm_get(ce->vm->gt);
> > >   }
> > >   static inline void intel_context_mark_active(struct intel_context *ce)
> > > @@ -222,8 +226,11 @@ static inline void intel_context_exit(struct intel_context *ce)
> > >   {
> > >   	lockdep_assert_held(&ce->timeline->mutex);
> > >   	GEM_BUG_ON(!ce->active_count);
> > > -	if (!--ce->active_count)
> > > -		ce->ops->exit(ce);
> > > +	if (--ce->active_count)
> > > +		return;
> > > +
> > > +	intel_gt_pm_put_async(ce->vm->gt);
> > > +	ce->ops->exit(ce);
> > 
> > shouldn't these two be swapped?

maybe I wasn't clear here... shouldn't it be

	ce->ops->exit(ce);
	intel_gt_pm_put_async(ce->vm->gt);

Don't we need to hold the pm until exiting?

> > >   }
> > >   static inline struct intel_context *intel_context_get(struct intel_context *ce)
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > > index e971b153fda976..ee531a5c142c77 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > > +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
> > > @@ -114,6 +114,15 @@ __queue_and_release_pm(struct i915_request *rq,
> > >   	ENGINE_TRACE(engine, "parking\n");
> > > +	/*
> > > +	 * Open coded one half of intel_context_enter, which we have to omit
> > > +	 * here (see the large comment below) and because the other part must
> > > +	 * not be called due constructing directly with __i915_request_create
> > > +	 * which increments active count via intel_context_mark_active.
> > > +	 */
> > > +	GEM_BUG_ON(rq->context->active_count != 1);
> > > +	__intel_gt_pm_get(engine->gt);
> > 
> > where is it's brother "put"?
> 
> It's in request retire via intel_context_exit. Ie. request construction is
> special here, while retirement is standard.

Thank you!
Andi
Tvrtko Ursulin April 4, 2023, 4:22 p.m. UTC | #6
On 04/04/2023 17:00, Andi Shyti wrote:
> Hi Tvrtko,
> 
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
>>>> index 0a8d553da3f439..48f888c3da083b 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>> @@ -14,6 +14,7 @@
>>>>    #include "i915_drv.h"
>>>>    #include "intel_context_types.h"
>>>>    #include "intel_engine_types.h"
>>>> +#include "intel_gt_pm.h"
>>>>    #include "intel_ring_types.h"
>>>>    #include "intel_timeline_types.h"
>>>>    #include "i915_trace.h"
>>>> @@ -207,8 +208,11 @@ void intel_context_exit_engine(struct intel_context *ce);
>>>>    static inline void intel_context_enter(struct intel_context *ce)
>>>>    {
>>>>    	lockdep_assert_held(&ce->timeline->mutex);
>>>> -	if (!ce->active_count++)
>>>> -		ce->ops->enter(ce);
>>>> +	if (ce->active_count++)
>>>> +		return;
>>>> +
>>>> +	ce->ops->enter(ce);
>>>> +	intel_gt_pm_get(ce->vm->gt);
>>>>    }
>>>>    static inline void intel_context_mark_active(struct intel_context *ce)
>>>> @@ -222,8 +226,11 @@ static inline void intel_context_exit(struct intel_context *ce)
>>>>    {
>>>>    	lockdep_assert_held(&ce->timeline->mutex);
>>>>    	GEM_BUG_ON(!ce->active_count);
>>>> -	if (!--ce->active_count)
>>>> -		ce->ops->exit(ce);
>>>> +	if (--ce->active_count)
>>>> +		return;
>>>> +
>>>> +	intel_gt_pm_put_async(ce->vm->gt);
>>>> +	ce->ops->exit(ce);
>>>
>>> shouldn't these two be swapped?
> 
> maybe I wasn't clear here... shouldn't it be

I missed this one.

> 	ce->ops->exit(ce);
> 	intel_gt_pm_put_async(ce->vm->gt);
> 
> Don't we need to hold the pm until exiting?

I think it doesn't matter. The problematic edge case this is fixing is 
when ce->engine->gt is different from ce->vm->gt but at this point if it 
is safe to release one it must be safe to release the other too.

Regards,

Tvrtko


> 
>>>>    }
>>>>    static inline struct intel_context *intel_context_get(struct intel_context *ce)
>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>> index e971b153fda976..ee531a5c142c77 100644
>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
>>>> @@ -114,6 +114,15 @@ __queue_and_release_pm(struct i915_request *rq,
>>>>    	ENGINE_TRACE(engine, "parking\n");
>>>> +	/*
>>>> +	 * Open coded one half of intel_context_enter, which we have to omit
>>>> +	 * here (see the large comment below) and because the other part must
>>>> +	 * not be called due constructing directly with __i915_request_create
>>>> +	 * which increments active count via intel_context_mark_active.
>>>> +	 */
>>>> +	GEM_BUG_ON(rq->context->active_count != 1);
>>>> +	__intel_gt_pm_get(engine->gt);
>>>
>>> where is it's brother "put"?
>>
>> It's in request retire via intel_context_exit. Ie. request construction is
>> special here, while retirement is standard.
> 
> Thank you!
> Andi
Andi Shyti April 4, 2023, 4:29 p.m. UTC | #7
Hi,

On Fri, Mar 31, 2023 at 04:16:36PM +0200, Andrzej Hajda wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> There may be a disconnect between the GT used by the engine and the GT
> used for the VM, requiring us to hold a wakeref on both while the GPU is
> active with this request.
> 
> v2: added explanation to __queue_and_release_pm
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> [ahajda: removed not-yet-upstremed wakeref tracking bits]
> Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com>

Thank you Tvrtko and Chris for answering my questions,

Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com> 

Andi
Andrzej Hajda April 5, 2023, 2:16 p.m. UTC | #8
On 31.03.2023 16:16, Andrzej Hajda wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> There may be a disconnect between the GT used by the engine and the GT
> used for the VM, requiring us to hold a wakeref on both while the GPU is
> active with this request.
> 
> v2: added explanation to __queue_and_release_pm
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> [ahajda: removed not-yet-upstremed wakeref tracking bits]
> Signed-off-by: Andrzej Hajda <andrzej.hajda@intel.com>


Queued.

Regards
Andrzej
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 0a8d553da3f439..48f888c3da083b 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.h
+++ b/drivers/gpu/drm/i915/gt/intel_context.h
@@ -14,6 +14,7 @@ 
 #include "i915_drv.h"
 #include "intel_context_types.h"
 #include "intel_engine_types.h"
+#include "intel_gt_pm.h"
 #include "intel_ring_types.h"
 #include "intel_timeline_types.h"
 #include "i915_trace.h"
@@ -207,8 +208,11 @@  void intel_context_exit_engine(struct intel_context *ce);
 static inline void intel_context_enter(struct intel_context *ce)
 {
 	lockdep_assert_held(&ce->timeline->mutex);
-	if (!ce->active_count++)
-		ce->ops->enter(ce);
+	if (ce->active_count++)
+		return;
+
+	ce->ops->enter(ce);
+	intel_gt_pm_get(ce->vm->gt);
 }
 
 static inline void intel_context_mark_active(struct intel_context *ce)
@@ -222,8 +226,11 @@  static inline void intel_context_exit(struct intel_context *ce)
 {
 	lockdep_assert_held(&ce->timeline->mutex);
 	GEM_BUG_ON(!ce->active_count);
-	if (!--ce->active_count)
-		ce->ops->exit(ce);
+	if (--ce->active_count)
+		return;
+
+	intel_gt_pm_put_async(ce->vm->gt);
+	ce->ops->exit(ce);
 }
 
 static inline struct intel_context *intel_context_get(struct intel_context *ce)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
index e971b153fda976..ee531a5c142c77 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c
@@ -114,6 +114,15 @@  __queue_and_release_pm(struct i915_request *rq,
 
 	ENGINE_TRACE(engine, "parking\n");
 
+	/*
+	 * Open coded one half of intel_context_enter, which we have to omit
+	 * here (see the large comment below) and because the other part must
+	 * not be called due constructing directly with __i915_request_create
+	 * which increments active count via intel_context_mark_active.
+	 */
+	GEM_BUG_ON(rq->context->active_count != 1);
+	__intel_gt_pm_get(engine->gt);
+
 	/*
 	 * We have to serialise all potential retirement paths with our
 	 * submission, as we don't want to underflow either the