@@ -55,7 +55,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
- $(AMDKFD_PATH)/kfd_crat.o
+ $(AMDKFD_PATH)/kfd_crat.o \
+ $(AMDKFD_PATH)/kfd_debug.o
ifneq ($(CONFIG_AMD_IOMMU_V2),)
AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
@@ -44,6 +44,7 @@
#include "amdgpu_amdkfd.h"
#include "kfd_smi_events.h"
#include "amdgpu_dma_buf.h"
+#include "kfd_debug.h"
static long kfd_ioctl(struct file *, unsigned int, unsigned long);
static int kfd_open(struct inode *, struct file *);
@@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
return -EPERM;
}
- process = kfd_create_process(filep);
+ process = kfd_create_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
+ if (kfd_process_init_cwsr_apu(process, filep)) {
+ kfd_unref_process(process);
+ return -EFAULT;
+ }
+
/* filep now owns the reference returned by kfd_create_process */
filep->private_data = process;
@@ -2737,6 +2743,10 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
{
struct kfd_ioctl_dbg_trap_args *args = data;
+ struct task_struct *thread = NULL;
+ struct mm_struct *mm = NULL;
+ struct pid *pid = NULL;
+ struct kfd_process *target = NULL;
int r = 0;
if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2744,9 +2754,81 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
return -EINVAL;
}
+ pid = find_get_pid(args->pid);
+ if (!pid) {
+ pr_debug("Cannot find pid info for %i\n", args->pid);
+ r = -ESRCH;
+ goto out;
+ }
+
+ thread = get_pid_task(pid, PIDTYPE_PID);
+ if (!thread) {
+ r = -ESRCH;
+ goto out;
+ }
+
+ mm = get_task_mm(thread);
+ if (!mm) {
+ r = -ESRCH;
+ goto out;
+ }
+
+ if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
+ bool create_process;
+
+ rcu_read_lock();
+ create_process = thread && thread != current && ptrace_parent(thread) == current;
+ rcu_read_unlock();
+
+ target = create_process ? kfd_create_process(thread) :
+ kfd_lookup_process_by_pid(pid);
+ } else {
+ target = kfd_lookup_process_by_pid(pid);
+ }
+
+ if (!target) {
+ pr_debug("Cannot find process PID %i to debug\n", args->pid);
+ r = -ESRCH;
+ goto out;
+ }
+
+ /* Check if target is still PTRACED. */
+ rcu_read_lock();
+ if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
+ && ptrace_parent(target->lead_thread) != current) {
+ pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
+ r = -EPERM;
+ }
+ rcu_read_unlock();
+
+ if (r)
+ goto out;
+
+ mutex_lock(&target->mutex);
+
+ if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
+ pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
+ r = -EINVAL;
+ goto unlock_out;
+ }
+
switch (args->op) {
case KFD_IOC_DBG_TRAP_ENABLE:
+ if (target != p)
+ target->debugger_process = p;
+
+ r = kfd_dbg_trap_enable(target,
+ args->enable.dbg_fd,
+ (void __user *)args->enable.rinfo_ptr,
+ &args->enable.rinfo_size);
+ if (!r)
+ target->exception_enable_mask = args->enable.exception_mask;
+
+ pr_warn("Debug functions limited\n");
+ break;
case KFD_IOC_DBG_TRAP_DISABLE:
+ r = kfd_dbg_trap_disable(target);
+ break;
case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
@@ -2760,7 +2842,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
- pr_warn("Debugging not supported yet\n");
+ pr_warn("Debug op %i not supported yet\n", args->op);
r = -EACCES;
break;
default:
@@ -2768,6 +2850,22 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
r = -EINVAL;
}
+unlock_out:
+ mutex_unlock(&target->mutex);
+
+out:
+ if (thread)
+ put_task_struct(thread);
+
+ if (mm)
+ mmput(mm);
+
+ if (pid)
+ put_pid(pid);
+
+ if (target)
+ kfd_unref_process(target);
+
return r;
}
new file mode 100644
@@ -0,0 +1,80 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_debug.h"
+#include <linux/file.h>
+
+int kfd_dbg_trap_disable(struct kfd_process *target)
+{
+ if (!target->debug_trap_enabled)
+ return 0;
+
+ fput(target->dbg_ev_file);
+ target->dbg_ev_file = NULL;
+
+ if (target->debugger_process) {
+ atomic_dec(&target->debugger_process->debugged_process_count);
+ target->debugger_process = NULL;
+ }
+
+ target->debug_trap_enabled = false;
+ kfd_unref_process(target);
+
+ return 0;
+}
+
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+ void __user *runtime_info, uint32_t *runtime_size)
+{
+ struct file *f;
+ uint32_t copy_size;
+ int r = 0;
+
+ if (target->debug_trap_enabled)
+ return -EALREADY;
+
+ copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
+
+ f = fget(fd);
+ if (!f) {
+ pr_err("Failed to get file for (%i)\n", fd);
+ return -EBADF;
+ }
+
+ target->dbg_ev_file = f;
+
+ /* We already hold the process reference but hold another one for the
+ * debug session.
+ */
+ kref_get(&target->ref);
+ target->debug_trap_enabled = true;
+
+ if (target->debugger_process)
+ atomic_inc(&target->debugger_process->debugged_process_count);
+
+ if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+ r = -EFAULT;
+
+ *runtime_size = sizeof(target->runtime_info);
+
+ return r;
+}
new file mode 100644
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
+#define KFD_DEBUG_EVENTS_H_INCLUDED
+
+#include "kfd_priv.h"
+
+int kfd_dbg_trap_disable(struct kfd_process *target);
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+ void __user *runtime_info,
+ uint32_t *runtime_info_size);
+#endif
@@ -1006,6 +1006,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
goto out;
pdd = qpd_to_pdd(qpd);
+
+ /* The debugger creates processes that temporarily have not acquired
+ * all VMs for all devices and has no VMs itself.
+ * Skip queue eviction on process eviction.
+ */
+ if (!pdd->drm_priv)
+ goto out;
+
pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
pdd->process->pasid);
@@ -1127,13 +1135,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
{
struct queue *q;
struct kfd_process_device *pdd;
- uint64_t pd_base;
uint64_t eviction_duration;
int retval = 0;
pdd = qpd_to_pdd(qpd);
- /* Retrieve PD base */
- pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
dqm_lock(dqm);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
@@ -1143,12 +1148,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
goto out;
}
+ /* The debugger creates processes that temporarily have not acquired
+ * all VMs for all devices and has no VMs itself.
+ * Skip queue restore on process restore.
+ */
+ if (!pdd->drm_priv)
+ goto vm_not_acquired;
+
pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
pdd->process->pasid);
/* Update PD Base in QPD */
- qpd->page_table_base = pd_base;
- pr_debug("Updated PD address to 0x%llx\n", pd_base);
+ qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
+ pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
/* activate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
@@ -1171,9 +1183,11 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
- qpd->evicted = 0;
+
eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
atomic64_add(eviction_duration, &pdd->evict_duration_counter);
+vm_not_acquired:
+ qpd->evicted = 0;
out:
dqm_unlock(dqm);
return retval;
@@ -920,11 +920,33 @@ struct kfd_process {
*/
unsigned long last_restore_timestamp;
+ /* Indicates device process is debug attached with reserved vmid. */
+ bool debug_trap_enabled;
+
+ /* per-process-per device debug event fd file */
+ struct file *dbg_ev_file;
+
+ /* If the process is a kfd debugger, we need to know so we can clean
+ * up at exit time. If a process enables debugging on itself, it does
+ * its own clean-up, so we don't set the flag here. We track this by
+ * counting the number of processes this process is debugging.
+ */
+ atomic_t debugged_process_count;
+
+ /* If the process is a debugged, this is the debugger process */
+ struct kfd_process *debugger_process;
+
/* Kobj for our procfs */
struct kobject *kobj;
struct kobject *kobj_queues;
struct attribute attr_pasid;
+ /* Keep track cwsr init */
+ bool has_cwsr;
+
+ /* Exception code enable mask and status */
+ uint64_t exception_enable_mask;
+
/* shared virtual memory registered by this process */
struct svm_range_list svms;
@@ -933,6 +955,10 @@ struct kfd_process {
atomic_t poison;
/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
bool queues_paused;
+
+ /* Tracks runtime enable status */
+ struct kfd_runtime_info runtime_info;
+
};
#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
@@ -963,7 +989,7 @@ bool kfd_dev_is_large_bar(struct kfd_node *dev);
int kfd_process_create_wq(void);
void kfd_process_destroy_wq(void);
void kfd_cleanup_processes(void);
-struct kfd_process *kfd_create_process(struct file *filep);
+struct kfd_process *kfd_create_process(struct task_struct *thread);
struct kfd_process *kfd_get_process(const struct task_struct *task);
struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
@@ -1108,6 +1134,9 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
uint64_t tba_addr,
uint64_t tma_addr);
+/* CWSR initialization */
+int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
+
/* CRIU */
/*
* Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
@@ -44,6 +44,7 @@ struct mm_struct;
#include "kfd_iommu.h"
#include "kfd_svm.h"
#include "kfd_smi_events.h"
+#include "kfd_debug.h"
/*
* List of struct kfd_process (field kfd_process).
@@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct task_struct *thread,
bool ref);
static void kfd_process_ref_release(struct kref *ref);
static struct kfd_process *create_process(const struct task_struct *thread);
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);
@@ -798,18 +798,19 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
}
-struct kfd_process *kfd_create_process(struct file *filep)
+struct kfd_process *kfd_create_process(struct task_struct *thread)
{
struct kfd_process *process;
- struct task_struct *thread = current;
int ret;
- if (!thread->mm)
+ if (!(thread->mm && mmget_not_zero(thread->mm)))
return ERR_PTR(-EINVAL);
/* Only the pthreads threading model is supported. */
- if (thread->group_leader->mm != thread->mm)
+ if (thread->group_leader->mm != thread->mm) {
+ mmput(thread->mm);
return ERR_PTR(-EINVAL);
+ }
/*
* take kfd processes mutex before starting of process creation
@@ -833,10 +834,6 @@ struct kfd_process *kfd_create_process(struct file *filep)
if (IS_ERR(process))
goto out;
- ret = kfd_process_init_cwsr_apu(process, filep);
- if (ret)
- goto out_destroy;
-
if (!procfs.kobj)
goto out;
@@ -870,16 +867,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
if (!IS_ERR(process))
kref_get(&process->ref);
mutex_unlock(&kfd_processes_mutex);
+ mmput(thread->mm);
return process;
-
-out_destroy:
- hash_del_rcu(&process->kfd_processes);
- mutex_unlock(&kfd_processes_mutex);
- synchronize_srcu(&kfd_processes_srcu);
- /* kfd_process_free_notifier will trigger the cleanup */
- mmu_notifier_put(&process->mmu_notifier);
- return ERR_PTR(ret);
}
struct kfd_process *kfd_get_process(const struct task_struct *thread)
@@ -1180,6 +1170,25 @@ static void kfd_process_notifier_release_internal(struct kfd_process *p)
/* Indicate to other users that MM is no longer valid */
p->mm = NULL;
+ kfd_dbg_trap_disable(p);
+
+ if (atomic_read(&p->debugged_process_count) > 0) {
+ struct kfd_process *target;
+ unsigned int temp;
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
+ if (target->debugger_process && target->debugger_process == p) {
+ mutex_lock_nested(&target->mutex, 1);
+ kfd_dbg_trap_disable(target);
+ mutex_unlock(&target->mutex);
+ if (atomic_read(&p->debugged_process_count) == 0)
+ break;
+ }
+ }
+
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+ }
mmu_notifier_put(&p->mmu_notifier);
}
@@ -1259,11 +1268,14 @@ void kfd_cleanup_processes(void)
mmu_notifier_synchronize();
}
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
+int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
{
unsigned long offset;
int i;
+ if (p->has_cwsr)
+ return 0;
+
for (i = 0; i < p->n_pdds; i++) {
struct kfd_node *dev = p->pdds[i]->dev;
struct qcm_process_device *qpd = &p->pdds[i]->qpd;
@@ -1292,6 +1304,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
}
+ p->has_cwsr = true;
+
return 0;
}
@@ -1434,6 +1448,10 @@ static struct kfd_process *create_process(const struct task_struct *thread)
if (err)
goto err_event_init;
process->is_32bit_user_mode = in_compat_syscall();
+ process->debug_trap_enabled = false;
+ process->debugger_process = NULL;
+ process->exception_enable_mask = 0;
+ atomic_set(&process->debugged_process_count, 0);
process->pasid = kfd_pasid_alloc();
if (process->pasid == 0) {
@@ -1967,8 +1985,10 @@ static void restore_process_worker(struct work_struct *work)
*/
p->last_restore_timestamp = get_jiffies_64();
- ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
- &p->ef);
+ /* VMs may not have been acquired yet during debugging. */
+ if (p->kgd_process_info)
+ ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
+ &p->ef);
if (ret) {
pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
p->pasid, PROCESS_BACK_OFF_TIME_MS);