diff mbox series

[RFC,05/17] drm/amdkfd: CRIU Implement KFD dumper ioctl

Message ID 20210501015752.888-6-Felix.Kuehling@amd.com (mailing list archive)
State New, archived
Headers show
Series CRIU support for ROCm | expand

Commit Message

Felix Kuehling May 1, 2021, 1:57 a.m. UTC
From: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>

This adds support to discover the buffer objects that belong to a
process being checkpointed. The data corresponding to these buffer
objects is returned to user space plugin running under criu master
context which then stores this info to recreate these buffer objects
during a restore operation.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
(cherry picked from commit 1f114a541bd21873de905db64bb9efa673274d4b)
(cherry picked from commit 20c435fad57d3201e5402e38ae778f1f0f84a09d)
Change-Id: Ifdbeee3606578db61bdc9aca7528272e20a38256
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 96 +++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6b347ce5992f..1454f5613e60 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -42,6 +42,7 @@ 
 #include "kfd_svm.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
+#include "amdgpu_object.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1806,9 +1807,102 @@  static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
 static int kfd_ioctl_criu_dumper(struct file *filep,
 				struct kfd_process *p, void *data)
 {
+	struct kfd_ioctl_criu_dumper_args *args = data;
+	struct kfd_criu_bo_buckets *bo_bucket;
+	struct amdgpu_bo *dumper_bo;
+	int ret, id, index, i = 0;
+	struct kgd_mem *kgd_mem;
+	void *mem;
+
 	pr_info("Inside %s\n",__func__);
 
-	return 0;
+	if (args->num_of_bos == 0) {
+		pr_err("No BOs to be dumped\n");
+		return -EINVAL;
+	}
+
+	if (p->n_pdds != args->num_of_devices) {
+		pr_err("Invalid number of devices %d (expected = %d)\n",
+								args->num_of_devices, p->n_pdds);
+		return -EINVAL;
+	}
+
+	pr_debug("num of bos = %llu\n", args->num_of_bos);
+
+	bo_bucket = kvzalloc((sizeof(struct kfd_criu_bo_buckets) *
+			     args->num_of_bos), GFP_KERNEL);
+	if (!bo_bucket)
+		return -ENOMEM;
+
+	mutex_lock(&p->mutex);
+
+	if (!kfd_has_process_device_data(p)) {
+		pr_err("No pdd for given process\n");
+		ret = -ENODEV;
+		goto err_unlock;
+	}
+
+	/* Run over all PDDs of the process */
+	for (index = 0; index < p->n_pdds; index++) {
+		struct kfd_process_device *pdd = p->pdds[index];
+
+		idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+			if (!mem) {
+				ret = -ENOMEM;
+				goto err_unlock;
+			}
+
+			kgd_mem = (struct kgd_mem *)mem;
+			dumper_bo = kgd_mem->bo;
+
+			if ((uint64_t)kgd_mem->va > pdd->gpuvm_base) {
+				if (i >= args->num_of_bos) {
+					pr_err("Num of BOs changed since last helper ioctl call\n");
+					ret = -EINVAL;
+					goto err_unlock;
+				}
+
+				bo_bucket[i].bo_addr = (uint64_t)kgd_mem->va;
+				bo_bucket[i].bo_size = amdgpu_bo_size(dumper_bo);
+				bo_bucket[i].gpu_id = pdd->dev->id;
+				bo_bucket[i].bo_alloc_flags = (uint32_t)kgd_mem->alloc_flags;
+				bo_bucket[i].idr_handle = id;
+
+				if (bo_bucket[i].bo_alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+					bo_bucket[i].bo_offset = KFD_MMAP_TYPE_DOORBELL |
+						KFD_MMAP_GPU_ID(pdd->dev->id);
+				else if (bo_bucket[i].bo_alloc_flags &
+				    KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
+					bo_bucket[i].bo_offset = KFD_MMAP_TYPE_MMIO |
+						KFD_MMAP_GPU_ID(pdd->dev->id);
+				else
+					bo_bucket[i].bo_offset = amdgpu_bo_mmap_offset(dumper_bo);
+
+				pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
+					 "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
+					 bo_bucket[i].bo_size,
+					 bo_bucket[i].bo_addr,
+					 bo_bucket[i].bo_offset,
+					 bo_bucket[i].gpu_id,
+					 bo_bucket[i].bo_alloc_flags,
+					 bo_bucket[i].idr_handle);
+				i++;
+			}
+		}
+	}
+
+	ret = copy_to_user((void __user *)args->kfd_criu_bo_buckets_ptr,
+			bo_bucket,
+			(args->num_of_bos *
+			 sizeof(struct kfd_criu_bo_buckets)));
+	kvfree(bo_bucket);
+	mutex_unlock(&p->mutex);
+	return ret ? -EFAULT : 0;
+
+err_unlock:
+	mutex_unlock(&p->mutex);
+	pr_err("Dumper ioctl failed err:%d\n", ret);
+	return ret;
 }
 
 static int kfd_ioctl_criu_restorer(struct file *filep,