diff mbox series

[03/11] drm/amdkfd: Improve amdgpu_vm_handle_moved

Message ID 20231017211337.1593869-4-Felix.Kuehling@amd.com (mailing list archive)
State New, archived
Headers show
Series Enable integration of KFD with DRM GEM_VA ioctl | expand

Commit Message

Felix Kuehling Oct. 17, 2023, 9:13 p.m. UTC
Let amdgpu_vm_handle_moved update all BO VA mappings of BOs reserved by
the caller. This will be useful for handling extra BO VA mappings in
KFD VMs that are managed through the render node API.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 22 +--------------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 19 +++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  3 ++-
 4 files changed, 18 insertions(+), 28 deletions(-)

Comments

Felix Kuehling Nov. 1, 2023, 5:09 p.m. UTC | #1
On 2023-10-17 17:13, Felix Kuehling wrote:
> Let amdgpu_vm_handle_moved update all BO VA mappings of BOs reserved by
> the caller. This will be useful for handling extra BO VA mappings in
> KFD VMs that are managed through the render node API.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> Reviewed-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 22 +--------------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 19 +++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  3 ++-
>   4 files changed, 18 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 74769afaa33d..c8f2907ebd6f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1113,7 +1113,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   	struct amdgpu_vm *vm = &fpriv->vm;
>   	struct amdgpu_bo_list_entry *e;
>   	struct amdgpu_bo_va *bo_va;
> -	struct amdgpu_bo *bo;
>   	unsigned int i;
>   	int r;
>   
> @@ -1141,26 +1140,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>   			return r;
>   	}
>   
> -	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
> -		/* ignore duplicates */
> -		bo = ttm_to_amdgpu_bo(e->tv.bo);
> -		if (!bo)
> -			continue;
> -
> -		bo_va = e->bo_va;
> -		if (bo_va == NULL)
> -			continue;
> -
> -		r = amdgpu_vm_bo_update(adev, bo_va, false);
> -		if (r)
> -			return r;
> -
> -		r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
> -		if (r)
> -			return r;
> -	}

FYI, removing this loop seemed to cause PSDB failures, mostly in RADV 
tests. It may have been a glitch in the infrastructure, but the failures 
were consistent on the three subsequent patches, too. Restoring this 
loop seems to make the tests pass, so I'll do that for now. I don't have 
time to debug CS with RADV, and this change is not needed for my ROCm work.

Regards,
   Felix


> -
> -	r = amdgpu_vm_handle_moved(adev, vm);
> +	r = amdgpu_vm_handle_moved(adev, vm, &p->ticket);
>   	if (r)
>   		return r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> index b5e28fa3f414..e7e87a3b2601 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
> @@ -409,7 +409,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
>   		if (!r)
>   			r = amdgpu_vm_clear_freed(adev, vm, NULL);
>   		if (!r)
> -			r = amdgpu_vm_handle_moved(adev, vm);
> +			r = amdgpu_vm_handle_moved(adev, vm, ticket);
>   
>   		if (r && r != -EBUSY)
>   			DRM_ERROR("Failed to invalidate VM page tables (%d))\n",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index d72daf15662f..c586d0e93d75 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1285,6 +1285,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>    *
>    * @adev: amdgpu_device pointer
>    * @vm: requested vm
> + * @ticket: optional reservation ticket used to reserve the VM
>    *
>    * Make sure all BOs which are moved are updated in the PTs.
>    *
> @@ -1294,11 +1295,12 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>    * PTs have to be reserved!
>    */
>   int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
> -			   struct amdgpu_vm *vm)
> +			   struct amdgpu_vm *vm,
> +			   struct ww_acquire_ctx *ticket)
>   {
>   	struct amdgpu_bo_va *bo_va;
>   	struct dma_resv *resv;
> -	bool clear;
> +	bool clear, unlock;
>   	int r;
>   
>   	spin_lock(&vm->status_lock);
> @@ -1321,17 +1323,24 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
>   		spin_unlock(&vm->status_lock);
>   
>   		/* Try to reserve the BO to avoid clearing its ptes */
> -		if (!adev->debug_vm && dma_resv_trylock(resv))
> +		if (!adev->debug_vm && dma_resv_trylock(resv)) {
>   			clear = false;
> +			unlock = true;
> +		/* The caller is already holding the reservation lock */
> +		} else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
> +			clear = false;
> +			unlock = false;
>   		/* Somebody else is using the BO right now */
> -		else
> +		} else {
>   			clear = true;
> +			unlock = false;
> +		}
>   
>   		r = amdgpu_vm_bo_update(adev, bo_va, clear);
>   		if (r)
>   			return r;
>   
> -		if (!clear)
> +		if (unlock)
>   			dma_resv_unlock(resv);
>   		spin_lock(&vm->status_lock);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 6e71978db13f..ebcc75132b74 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -432,7 +432,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
>   			  struct amdgpu_vm *vm,
>   			  struct dma_fence **fence);
>   int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
> -			   struct amdgpu_vm *vm);
> +			   struct amdgpu_vm *vm,
> +			   struct ww_acquire_ctx *ticket);
>   void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
>   			    struct amdgpu_vm *vm, struct amdgpu_bo *bo);
>   int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 74769afaa33d..c8f2907ebd6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1113,7 +1113,6 @@  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 	struct amdgpu_vm *vm = &fpriv->vm;
 	struct amdgpu_bo_list_entry *e;
 	struct amdgpu_bo_va *bo_va;
-	struct amdgpu_bo *bo;
 	unsigned int i;
 	int r;
 
@@ -1141,26 +1140,7 @@  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 			return r;
 	}
 
-	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
-		/* ignore duplicates */
-		bo = ttm_to_amdgpu_bo(e->tv.bo);
-		if (!bo)
-			continue;
-
-		bo_va = e->bo_va;
-		if (bo_va == NULL)
-			continue;
-
-		r = amdgpu_vm_bo_update(adev, bo_va, false);
-		if (r)
-			return r;
-
-		r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
-		if (r)
-			return r;
-	}
-
-	r = amdgpu_vm_handle_moved(adev, vm);
+	r = amdgpu_vm_handle_moved(adev, vm, &p->ticket);
 	if (r)
 		return r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index b5e28fa3f414..e7e87a3b2601 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -409,7 +409,7 @@  amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
 		if (!r)
 			r = amdgpu_vm_clear_freed(adev, vm, NULL);
 		if (!r)
-			r = amdgpu_vm_handle_moved(adev, vm);
+			r = amdgpu_vm_handle_moved(adev, vm, ticket);
 
 		if (r && r != -EBUSY)
 			DRM_ERROR("Failed to invalidate VM page tables (%d))\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d72daf15662f..c586d0e93d75 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1285,6 +1285,7 @@  int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
  *
  * @adev: amdgpu_device pointer
  * @vm: requested vm
+ * @ticket: optional reservation ticket used to reserve the VM
  *
  * Make sure all BOs which are moved are updated in the PTs.
  *
@@ -1294,11 +1295,12 @@  int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
  * PTs have to be reserved!
  */
 int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
-			   struct amdgpu_vm *vm)
+			   struct amdgpu_vm *vm,
+			   struct ww_acquire_ctx *ticket)
 {
 	struct amdgpu_bo_va *bo_va;
 	struct dma_resv *resv;
-	bool clear;
+	bool clear, unlock;
 	int r;
 
 	spin_lock(&vm->status_lock);
@@ -1321,17 +1323,24 @@  int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
 		spin_unlock(&vm->status_lock);
 
 		/* Try to reserve the BO to avoid clearing its ptes */
-		if (!adev->debug_vm && dma_resv_trylock(resv))
+		if (!adev->debug_vm && dma_resv_trylock(resv)) {
 			clear = false;
+			unlock = true;
+		/* The caller is already holding the reservation lock */
+		} else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
+			clear = false;
+			unlock = false;
 		/* Somebody else is using the BO right now */
-		else
+		} else {
 			clear = true;
+			unlock = false;
+		}
 
 		r = amdgpu_vm_bo_update(adev, bo_va, clear);
 		if (r)
 			return r;
 
-		if (!clear)
+		if (unlock)
 			dma_resv_unlock(resv);
 		spin_lock(&vm->status_lock);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 6e71978db13f..ebcc75132b74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -432,7 +432,8 @@  int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
 			  struct amdgpu_vm *vm,
 			  struct dma_fence **fence);
 int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
-			   struct amdgpu_vm *vm);
+			   struct amdgpu_vm *vm,
+			   struct ww_acquire_ctx *ticket);
 void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
 			    struct amdgpu_vm *vm, struct amdgpu_bo *bo);
 int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,