diff mbox series

[2/2] drm/amdgpu: remove all KFD fences from the BO on release

Message ID 20250129152849.15777-2-christian.koenig@amd.com (mailing list archive)
State New
Headers show
Series [1/2] drm/ttm: test private resv obj on release/destroy | expand

Commit Message

Christian König Jan. 29, 2025, 3:28 p.m. UTC
Remove all KFD BOs from the private dma_resv object.

This prevents the KFD from being evitec unecessarily when an exported BO
is released.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  5 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 52 ++++++++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    | 38 ++++++++------
 3 files changed, 47 insertions(+), 48 deletions(-)

Comments

Felix Kuehling Jan. 29, 2025, 4:54 p.m. UTC | #1
On 2025-01-29 10:28, Christian König wrote:
> Remove all KFD BOs from the private dma_resv object.
> 
> This prevents the KFD from being evitec unecessarily when an exported BO
> is released.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>

Looks good to me. Assuming James doesn't find any issues in testing, the series is

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  5 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 52 ++++++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c    | 38 ++++++++------
>  3 files changed, 47 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 4b80ad860639..62917f76da33 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -192,7 +192,7 @@ int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data);
>  #if IS_ENABLED(CONFIG_HSA_AMD)
>  bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
>  struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
> -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
> +void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo);
>  int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
>  				unsigned long cur_seq, struct kgd_mem *mem);
>  int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
> @@ -212,9 +212,8 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
>  }
>  
>  static inline
> -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
> +void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
>  {
> -	return 0;
>  }
>  
>  static inline
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index f30548f4c3b3..609b27fe1cda 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -370,40 +370,32 @@ static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
>  	return 0;
>  }
>  
> -int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
> +/**
> + * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences
> + * @bo: the BO where to remove the evictions fences from.
> + *
> + * This functions should only be used on release when all references to the BO
> + * are already dropped. We remove the eviction fence from the private copy of
> + * the dma_resv object here since that is what is used during release to
> + * determine of the BO is idle or not.
> + */
> +void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
>  {
> -	struct amdgpu_bo *root = bo;
> -	struct amdgpu_vm_bo_base *vm_bo;
> -	struct amdgpu_vm *vm;
> -	struct amdkfd_process_info *info;
> -	struct amdgpu_amdkfd_fence *ef;
> -	int ret;
> -
> -	/* we can always get vm_bo from root PD bo.*/
> -	while (root->parent)
> -		root = root->parent;
> +	struct dma_resv *resv = &bo->tbo.base._resv;
> +	struct dma_fence *fence, *stub;
> +	struct dma_resv_iter cursor;
>  
> -	vm_bo = root->vm_bo;
> -	if (!vm_bo)
> -		return 0;
> +	dma_resv_assert_held(resv);
>  
> -	vm = vm_bo->vm;
> -	if (!vm)
> -		return 0;
> -
> -	info = vm->process_info;
> -	if (!info || !info->eviction_fence)
> -		return 0;
> -
> -	ef = container_of(dma_fence_get(&info->eviction_fence->base),
> -			struct amdgpu_amdkfd_fence, base);
> -
> -	BUG_ON(!dma_resv_trylock(bo->tbo.base.resv));
> -	ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef);
> -	dma_resv_unlock(bo->tbo.base.resv);
> +	stub = dma_fence_get_stub();
> +	dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) {
> +		if (!to_amdgpu_amdkfd_fence(fence))
> +			continue;
>  
> -	dma_fence_put(&ef->base);
> -	return ret;
> +		dma_resv_replace_fences(resv, fence->context, stub,
> +					DMA_RESV_USAGE_BOOKKEEP);
> +	}
> +	dma_fence_put(stub);
>  }
>  
>  static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index fc94b8b9b86d..d12be7a1eb6e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1194,28 +1194,36 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
>  	if (abo->kfd_bo)
>  		amdgpu_amdkfd_release_notify(abo);
>  
> -	/* We only remove the fence if the resv has individualized. */
> -	WARN_ON_ONCE(bo->type == ttm_bo_type_kernel
> -			&& bo->base.resv != &bo->base._resv);
> -	if (bo->base.resv == &bo->base._resv)
> -		amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo);
> +	/*
> +	 * We lock the private dma_resv object here and since the BO is about to
> +	 * be released nobody else should have a pointer to it.
> +	 * So when this locking here fails something is wrong with the reference
> +	 * counting.
> +	 */
> +	if (WARN_ON_ONCE(!dma_resv_trylock(&bo->base._resv)))
> +		return;
> +
> +	amdgpu_amdkfd_remove_all_eviction_fences(abo);
>  
>  	if (!bo->resource || bo->resource->mem_type != TTM_PL_VRAM ||
>  	    !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE) ||
>  	    adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev)))
> -		return;
> +		goto out;
>  
> -	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
> -		return;
> +	r = dma_resv_reserve_fences(&bo->base._resv, 1);
> +	if (r)
> +		goto out;
>  
> -	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
> -	if (!WARN_ON(r)) {
> -		amdgpu_vram_mgr_set_cleared(bo->resource);
> -		amdgpu_bo_fence(abo, fence, false);
> -		dma_fence_put(fence);
> -	}
> +	r = amdgpu_fill_buffer(abo, 0, &bo->base._resv, &fence, true);
> +	if (WARN_ON(r))
> +		goto out;
> +
> +	amdgpu_vram_mgr_set_cleared(bo->resource);
> +	dma_resv_add_fence(&bo->base._resv, fence, DMA_RESV_USAGE_KERNEL);
> +	dma_fence_put(fence);
>  
> -	dma_resv_unlock(bo->base.resv);
> +out:
> +	dma_resv_unlock(&bo->base._resv);
>  }
>  
>  /**
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4b80ad860639..62917f76da33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -192,7 +192,7 @@  int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data);
 #if IS_ENABLED(CONFIG_HSA_AMD)
 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
 struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
-int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
+void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo);
 int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
 				unsigned long cur_seq, struct kgd_mem *mem);
 int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
@@ -212,9 +212,8 @@  struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
 }
 
 static inline
-int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
+void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
 {
-	return 0;
 }
 
 static inline
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index f30548f4c3b3..609b27fe1cda 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -370,40 +370,32 @@  static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
 	return 0;
 }
 
-int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo)
+/**
+ * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences
+ * @bo: the BO where to remove the evictions fences from.
+ *
+ * This functions should only be used on release when all references to the BO
+ * are already dropped. We remove the eviction fence from the private copy of
+ * the dma_resv object here since that is what is used during release to
+ * determine of the BO is idle or not.
+ */
+void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
 {
-	struct amdgpu_bo *root = bo;
-	struct amdgpu_vm_bo_base *vm_bo;
-	struct amdgpu_vm *vm;
-	struct amdkfd_process_info *info;
-	struct amdgpu_amdkfd_fence *ef;
-	int ret;
-
-	/* we can always get vm_bo from root PD bo.*/
-	while (root->parent)
-		root = root->parent;
+	struct dma_resv *resv = &bo->tbo.base._resv;
+	struct dma_fence *fence, *stub;
+	struct dma_resv_iter cursor;
 
-	vm_bo = root->vm_bo;
-	if (!vm_bo)
-		return 0;
+	dma_resv_assert_held(resv);
 
-	vm = vm_bo->vm;
-	if (!vm)
-		return 0;
-
-	info = vm->process_info;
-	if (!info || !info->eviction_fence)
-		return 0;
-
-	ef = container_of(dma_fence_get(&info->eviction_fence->base),
-			struct amdgpu_amdkfd_fence, base);
-
-	BUG_ON(!dma_resv_trylock(bo->tbo.base.resv));
-	ret = amdgpu_amdkfd_remove_eviction_fence(bo, ef);
-	dma_resv_unlock(bo->tbo.base.resv);
+	stub = dma_fence_get_stub();
+	dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) {
+		if (!to_amdgpu_amdkfd_fence(fence))
+			continue;
 
-	dma_fence_put(&ef->base);
-	return ret;
+		dma_resv_replace_fences(resv, fence->context, stub,
+					DMA_RESV_USAGE_BOOKKEEP);
+	}
+	dma_fence_put(stub);
 }
 
 static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index fc94b8b9b86d..d12be7a1eb6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1194,28 +1194,36 @@  void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 	if (abo->kfd_bo)
 		amdgpu_amdkfd_release_notify(abo);
 
-	/* We only remove the fence if the resv has individualized. */
-	WARN_ON_ONCE(bo->type == ttm_bo_type_kernel
-			&& bo->base.resv != &bo->base._resv);
-	if (bo->base.resv == &bo->base._resv)
-		amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo);
+	/*
+	 * We lock the private dma_resv object here and since the BO is about to
+	 * be released nobody else should have a pointer to it.
+	 * So when this locking here fails something is wrong with the reference
+	 * counting.
+	 */
+	if (WARN_ON_ONCE(!dma_resv_trylock(&bo->base._resv)))
+		return;
+
+	amdgpu_amdkfd_remove_all_eviction_fences(abo);
 
 	if (!bo->resource || bo->resource->mem_type != TTM_PL_VRAM ||
 	    !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE) ||
 	    adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev)))
-		return;
+		goto out;
 
-	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
-		return;
+	r = dma_resv_reserve_fences(&bo->base._resv, 1);
+	if (r)
+		goto out;
 
-	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
-	if (!WARN_ON(r)) {
-		amdgpu_vram_mgr_set_cleared(bo->resource);
-		amdgpu_bo_fence(abo, fence, false);
-		dma_fence_put(fence);
-	}
+	r = amdgpu_fill_buffer(abo, 0, &bo->base._resv, &fence, true);
+	if (WARN_ON(r))
+		goto out;
+
+	amdgpu_vram_mgr_set_cleared(bo->resource);
+	dma_resv_add_fence(&bo->base._resv, fence, DMA_RESV_USAGE_KERNEL);
+	dma_fence_put(fence);
 
-	dma_resv_unlock(bo->base.resv);
+out:
+	dma_resv_unlock(&bo->base._resv);
 }
 
 /**