diff mbox series

[10/44] drm/amdkfd: map svm range to GPUs

Message ID 20210322105900.14068-11-Felix.Kuehling@amd.com (mailing list archive)
State New, archived
Headers show
Series Add HMM-based SVM memory manager to KFD v2 | expand

Commit Message

Felix Kuehling March 22, 2021, 10:58 a.m. UTC
Use amdgpu_vm_bo_update_mapping to update GPU page table to map or unmap
svm range system memory pages address to GPUs.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 395 +++++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h |   4 +
 2 files changed, 374 insertions(+), 25 deletions(-)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index e23171ac866a..1244ba380292 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -98,11 +98,99 @@  static void svm_range_remove_notifier(struct svm_range *prange)
 		mmu_interval_notifier_remove(&prange->notifier);
 }
 
+static int
+svm_range_dma_map(struct device *dev, dma_addr_t **dma_addr,
+		  unsigned long *pages_addr, uint64_t npages)
+{
+	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
+	dma_addr_t *addr = *dma_addr;
+	struct page *page;
+	int i, r;
+
+	if (!pages_addr)
+		return 0;
+
+	if (!addr) {
+		addr = kvmalloc_array(npages, sizeof(*addr),
+				      GFP_KERNEL | __GFP_ZERO);
+		if (!addr)
+			return -ENOMEM;
+		*dma_addr = addr;
+	}
+
+	for (i = 0; i < npages; i++) {
+		if (WARN_ONCE(addr[i] && !dma_mapping_error(dev, addr[i]),
+			      "leaking dma mapping\n"))
+			dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);
+
+		page = hmm_pfn_to_page(pages_addr[i]);
+		addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
+		r = dma_mapping_error(dev, addr[i]);
+		if (r) {
+			pr_debug("failed %d dma_map_page\n", r);
+			return r;
+		}
+		pr_debug("dma mapping 0x%llx for page addr 0x%lx\n",
+			 addr[i] >> PAGE_SHIFT, page_to_pfn(page));
+	}
+	return 0;
+}
+
+void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
+			 unsigned long offset, unsigned long npages)
+{
+	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
+	int i;
+
+	if (!dma_addr)
+		return;
+
+	for (i = offset; i < offset + npages; i++) {
+		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
+			continue;
+		pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
+		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
+		dma_addr[i] = 0;
+	}
+}
+
+static void svm_range_free_dma_mappings(struct svm_range *prange)
+{
+	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
+	struct kfd_dev *kfd_dev;
+	dma_addr_t *dma_addr;
+	struct device *dev;
+	struct kfd_process *p;
+	uint32_t gpuidx;
+	int r;
+
+	p = container_of(prange->svms, struct kfd_process, svms);
+	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
+		  MAX_GPU_INSTANCE);
+
+	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
+		dma_addr = prange->dma_addr[gpuidx];
+		if (!dma_addr)
+			continue;
+
+		r = kfd_process_device_from_gpuidx(p, gpuidx, &kfd_dev);
+		if (r) {
+			pr_debug("failed to find device idx %d\n", gpuidx);
+			return;
+		}
+		dev = &kfd_dev->pdev->dev;
+		svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
+		kvfree(dma_addr);
+		prange->dma_addr[gpuidx] = NULL;
+	}
+}
+
 static void svm_range_free(struct svm_range *prange)
 {
 	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
 		 prange->start, prange->last);
 
+	svm_range_free_dma_mappings(prange);
 	kvfree(prange->pages_addr);
 	kfree(prange);
 }
@@ -342,41 +430,62 @@  svm_range_is_same_attrs(struct svm_range *old, struct svm_range *new)
 }
 
 static int
-svm_range_split_pages(struct svm_range *new, struct svm_range *old,
-		      uint64_t start, uint64_t last)
+svm_range_split_array(void *ppnew, void *ppold, size_t size,
+		      uint64_t old_start, uint64_t old_n,
+		      uint64_t new_start, uint64_t new_n)
 {
-	unsigned long old_start;
-	unsigned long *pages_addr;
+	unsigned char *new, *old, *pold;
 	uint64_t d;
 
-	old_start = old->start;
-	new->pages_addr = kvmalloc_array(new->npages,
-					 sizeof(*new->pages_addr),
-					 GFP_KERNEL | __GFP_ZERO);
-	if (!new->pages_addr)
-		return -ENOMEM;
+	if (!ppold)
+		return 0;
+	pold = *(unsigned char **)ppold;
+	if (!pold)
+		return 0;
 
-	d = new->start - old_start;
-	memcpy(new->pages_addr, old->pages_addr + d,
-	       new->npages * sizeof(*new->pages_addr));
+	new = kvmalloc_array(new_n, size, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
 
-	old->npages = last - start + 1;
-	old->start = start;
-	old->last = last;
+	d = (new_start - old_start) * size;
+	memcpy(new, pold + d, new_n * size);
 
-	pages_addr = kvmalloc_array(old->npages, sizeof(*pages_addr),
-				    GFP_KERNEL);
-	if (!pages_addr) {
-		kvfree(new->pages_addr);
+	old = kvmalloc_array(old_n, size, GFP_KERNEL);
+	if (!old) {
+		kvfree(new);
 		return -ENOMEM;
 	}
 
-	d = start - old_start;
-	memcpy(pages_addr, old->pages_addr + d,
-	       old->npages * sizeof(*pages_addr));
+	d = (new_start == old_start) ? new_n * size : 0;
+	memcpy(old, pold + d, old_n * size);
 
-	kvfree(old->pages_addr);
-	old->pages_addr = pages_addr;
+	kvfree(pold);
+	*(void **)ppold = old;
+	*(void **)ppnew = new;
+
+	return 0;
+}
+
+static int
+svm_range_split_pages(struct svm_range *new, struct svm_range *old,
+		      uint64_t start, uint64_t last)
+{
+	uint64_t npages = last - start + 1;
+	int i, r;
+
+	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
+		r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
+					  sizeof(*old->dma_addr[i]), old->start,
+					  npages, new->start, new->npages);
+		if (r)
+			return r;
+	}
+
+	r = svm_range_split_array(&new->pages_addr, &old->pages_addr,
+				  sizeof(*old->pages_addr), old->start, npages,
+				  new->start, new->npages);
+	if (r)
+		return r;
 
 	return 0;
 }
@@ -525,6 +634,234 @@  void svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
 	list_add_tail(&pchild->child_list, &prange->child_list);
 }
 
+static uint64_t
+svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange)
+{
+	uint32_t flags = prange->flags;
+	uint32_t mapping_flags;
+	uint64_t pte_flags;
+
+	pte_flags = AMDGPU_PTE_VALID;
+	pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
+
+	mapping_flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
+
+	if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
+		mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
+	if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
+		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
+	if (flags & KFD_IOCTL_SVM_FLAG_COHERENT)
+		mapping_flags |= AMDGPU_VM_MTYPE_UC;
+	else
+		mapping_flags |= AMDGPU_VM_MTYPE_NC;
+
+	/* TODO: add CHIP_ARCTURUS new flags for vram mapping */
+
+	pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags);
+
+	/* Apply ASIC specific mapping flags */
+	amdgpu_gmc_get_vm_pte(adev, &prange->mapping, &pte_flags);
+
+	pr_debug("PTE flags 0x%llx\n", pte_flags);
+
+	return pte_flags;
+}
+
+static int
+svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+			 uint64_t start, uint64_t last,
+			 struct dma_fence **fence)
+{
+	uint64_t init_pte_value = 0;
+
+	pr_debug("[0x%llx 0x%llx]\n", start, last);
+
+	return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL,
+					   start, last, init_pte_value, 0,
+					   NULL, NULL, fence);
+}
+
+static int
+svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
+			  unsigned long last)
+{
+	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
+	struct kfd_process_device *pdd;
+	struct dma_fence *fence = NULL;
+	struct amdgpu_device *adev;
+	struct kfd_process *p;
+	struct kfd_dev *dev;
+	uint32_t gpuidx;
+	int r = 0;
+
+	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
+		  MAX_GPU_INSTANCE);
+	p = container_of(prange->svms, struct kfd_process, svms);
+
+	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
+		pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
+		r = kfd_process_device_from_gpuidx(p, gpuidx, &dev);
+		if (r) {
+			pr_debug("failed to find device idx %d\n", gpuidx);
+			return -EINVAL;
+		}
+
+		pdd = kfd_get_process_device_data(dev, p);
+		if (!pdd)
+			return -EINVAL;
+
+		adev = (struct amdgpu_device *)dev->kgd;
+
+		r = svm_range_unmap_from_gpu(adev, pdd->vm, start, last,
+					     &fence);
+		if (r)
+			break;
+
+		if (fence) {
+			r = dma_fence_wait(fence, false);
+			dma_fence_put(fence);
+			fence = NULL;
+			if (r)
+				break;
+		}
+
+		svm_range_dma_unmap(adev->dev, prange->dma_addr[gpuidx],
+				    start - prange->start,
+				    last - start + 1);
+
+		amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev,
+						  p->pasid);
+	}
+
+	return r;
+}
+
+static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
+{
+	struct ttm_operation_ctx ctx = { false, false };
+
+	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
+
+	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+}
+
+static int
+svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+		     struct svm_range *prange, dma_addr_t *pages_addr,
+		     bool reserve_vm, struct dma_fence **fence)
+{
+	struct amdgpu_bo *root;
+	uint64_t pte_flags;
+	int r = 0;
+
+	pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
+		 prange->last);
+
+	if (reserve_vm) {
+		root = amdgpu_bo_ref(vm->root.base.bo);
+		r = amdgpu_bo_reserve(root, true);
+		if (r) {
+			pr_debug("failed %d to reserve root bo\n", r);
+			amdgpu_bo_unref(&root);
+			goto out;
+		}
+		r = amdgpu_vm_validate_pt_bos(adev, vm, svm_range_bo_validate,
+					      NULL);
+		if (r) {
+			pr_debug("failed %d validate pt bos\n", r);
+			goto unreserve_out;
+		}
+	}
+
+	prange->mapping.start = prange->start;
+	prange->mapping.last = prange->last;
+	prange->mapping.offset = 0;
+	pte_flags = svm_range_get_pte_flags(adev, prange);
+	prange->mapping.flags = pte_flags;
+
+	r = amdgpu_vm_bo_update_mapping(adev, adev, vm, false, false, NULL,
+					prange->mapping.start,
+					prange->mapping.last, pte_flags,
+					prange->mapping.offset, NULL,
+					pages_addr, &vm->last_update);
+	if (r) {
+		pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
+		goto unreserve_out;
+	}
+
+
+	r = amdgpu_vm_update_pdes(adev, vm, false);
+	if (r) {
+		pr_debug("failed %d to update directories 0x%lx\n", r,
+			 prange->start);
+		goto unreserve_out;
+	}
+
+	if (fence)
+		*fence = dma_fence_get(vm->last_update);
+
+unreserve_out:
+	if (reserve_vm) {
+		amdgpu_bo_unreserve(root);
+		amdgpu_bo_unref(&root);
+	}
+
+out:
+	return r;
+}
+
+static int svm_range_map_to_gpus(struct svm_range *prange, bool reserve_vm)
+{
+	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
+	struct kfd_process_device *pdd;
+	struct amdgpu_device *adev;
+	struct kfd_process *p;
+	struct kfd_dev *dev;
+	struct dma_fence *fence = NULL;
+	uint32_t gpuidx;
+	int r = 0;
+
+	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
+		  MAX_GPU_INSTANCE);
+	p = container_of(prange->svms, struct kfd_process, svms);
+
+	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
+		r = kfd_process_device_from_gpuidx(p, gpuidx, &dev);
+		if (r) {
+			pr_debug("failed to find device idx %d\n", gpuidx);
+			return -EINVAL;
+		}
+
+		pdd = kfd_bind_process_to_device(dev, p);
+		if (IS_ERR(pdd))
+			return -EINVAL;
+		adev = (struct amdgpu_device *)dev->kgd;
+
+		r = svm_range_dma_map(adev->dev, &prange->dma_addr[gpuidx],
+				      prange->pages_addr, prange->npages);
+		if (r)
+			break;
+
+		r = svm_range_map_to_gpu(adev, pdd->vm, prange,
+					 prange->dma_addr[gpuidx], reserve_vm,
+					 &fence);
+		if (r)
+			break;
+
+		if (fence) {
+			r = dma_fence_wait(fence, false);
+			dma_fence_put(fence);
+			fence = NULL;
+			if (r) {
+				pr_debug("failed %d to dma fence wait\n", r);
+				break;
+			}
+		}
+	}
+
+	return r;
+}
+
 /**
  * svm_range_list_lock_and_flush_work - flush pending deferred work
  *
@@ -897,6 +1234,8 @@  svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
 	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
 		 prange, prange->start, prange->last, start, last);
 
+	svm_range_unmap_from_gpus(prange, start, last);
+
 	unmap_parent = start <= prange->start && last >= prange->last;
 
 	list_for_each_entry(pchild, &prange->child_list, child_list)
@@ -1138,6 +1477,12 @@  svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
 			pr_debug("failed %d to validate svm range\n", r);
 			break;
 		}
+
+		r = svm_range_map_to_gpus(prange, true);
+		if (r) {
+			pr_debug("failed %d to map svm range\n", r);
+			break;
+		}
 	}
 
 	svm_range_debug_dump(svms);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index f9b81245b94a..596c881f7674 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -57,8 +57,10 @@  struct svm_work_list_item {
  * @remove_list:link list node used to add to remove list
  * @insert_list:link list node used to add to insert list
  * @hmm_range:  hmm range structure used by hmm_range_fault to get system pages
+ * @mapping:    bo_va mapping structure to create and update GPU page table
  * @npages:     number of pages
  * @pages_addr: list of system memory physical page address
+ * @dma_addr:   dma mapping address on each GPU for system memory physical page
  * @flags:      flags defined as KFD_IOCTL_SVM_FLAG_*
  * @perferred_loc: perferred location, 0 for CPU, or GPU id
  * @perfetch_loc: last prefetch location, 0 for CPU, or GPU id
@@ -85,8 +87,10 @@  struct svm_range {
 	struct list_head		remove_list;
 	struct list_head		insert_list;
 	struct hmm_range		*hmm_range;
+	struct amdgpu_bo_va_mapping	mapping;
 	uint64_t			npages;
 	unsigned long			*pages_addr;
+	dma_addr_t			*dma_addr[MAX_GPU_INSTANCE];
 	uint32_t			flags;
 	uint32_t			preferred_loc;
 	uint32_t			prefetch_loc;