diff mbox series

[03/32] drm/amdkfd: prepare per-process debug enable and disable

Message ID 20230125195401.4183544-4-jonathan.kim@amd.com (mailing list archive)
State New, archived
Headers show
Series Upstream of kernel support for AMDGPU ISA debugging | expand

Commit Message

Kim, Jonathan Jan. 25, 2023, 7:53 p.m. UTC
The ROCm debugger will attach to a process to debug by PTRACE and will
expect the KFD to prepare a process for the target PID, whether the
target PID has opened the KFD device or not.

This patch is to explicitly handle this requirement.  Further HW mode
setting and runtime coordination requirements will be handled in
following patches.

In the case where the target process has not opened the KFD device,
a new KFD process must be created for the target PID.
The debugger as well as the target process for this case will not have
acquired any VMs, so handle process restoration to correctly account for
this.

To coordinate with HSA runtime, the debugger must be aware of the target
process' runtime enablement status and will copy the runtime status
information into the debugged KFD process for later query.

On enablement, the debugger will subscribe to a set of exceptions where
each exception event will notify the debugger through a pollable FIFO
file descriptor that the debugger provides to the KFD to manage.
Some events will be synchronously raised while others are scheduled,
which is why a debug_event_workarea worker is initialized.

Finally, on process termination of either the debugger or the target,
debugging must be disabled if that has not already been done.

v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
remove unnecessary queue eviction counter reset when there's nothing
to evict.
change err code to EALREADY if attaching to an already attached process.
move debug disable to release worker to avoid race with disable from
ioctl call.

v2: relax debug trap disable and PTRACE ATTACH requirement.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 ++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 94 +++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 63 +++++++++----
 7 files changed, 308 insertions(+), 29 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h

Comments

Felix Kuehling Feb. 16, 2023, 11:44 p.m. UTC | #1
On 2023-01-25 14:53, Jonathan Kim wrote:
> The ROCm debugger will attach to a process to debug by PTRACE and will
> expect the KFD to prepare a process for the target PID, whether the
> target PID has opened the KFD device or not.
>
> This patch is to explicity handle this requirement.  Further HW mode
> setting and runtime coordination requirements will be handled in
> following patches.
>
> In the case where the target process has not opened the KFD device,
> a new KFD process must be created for the target PID.
> The debugger as well as the target process for this case will have not
> acquired any VMs so handle process restoration to correctly account for
> this.
>
> To coordinate with HSA runtime, the debugger must be aware of the target
> process' runtime enablement status and will copy the runtime status
> information into the debugged KFD process for later query.
>
> On enablement, the debugger will subscribe to a set of exceptions where
> each exception events will notify the debugger through a pollable FIFO
> file descriptor that the debugger provides to the KFD to manage.
> Some events will be synchronously raised while other are scheduled,
> which is why a debug_event_workarea worker is initialized.
>
> Finally on process termination of either the debugger or the target,
> debugging must be disabled if it has not been done so.
>
> v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
> remove unnecessary queue eviction counter reset when there's nothing
> to evict.
> change err code to EALREADY if attaching to an already attached process.
> move debug disable to release worker to avoid race with disable from
> ioctl call.
>
> v2: relax debug trap disable and PTRACE ATTACH requirement.
>
> Signed-off-by: Jonathan Kim<jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 ++++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 94 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 63 +++++++++----
>   7 files changed, 308 insertions(+), 29 deletions(-)
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
> index e758c2a24cd0..747754428073 100644
> --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> @@ -55,7 +55,8 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v9.o \
>   		$(AMDKFD_PATH)/kfd_int_process_v11.o \
>   		$(AMDKFD_PATH)/kfd_smi_events.o \
> -		$(AMDKFD_PATH)/kfd_crat.o
> +		$(AMDKFD_PATH)/kfd_crat.o \
> +		$(AMDKFD_PATH)/kfd_debug.o
>   
>   ifneq ($(CONFIG_AMD_IOMMU_V2),)
>   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index d3b019e64093..ee05c2e54ef6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -44,6 +44,7 @@
>   #include "amdgpu_amdkfd.h"
>   #include "kfd_smi_events.h"
>   #include "amdgpu_dma_buf.h"
> +#include "kfd_debug.h"
>   
>   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>   static int kfd_open(struct inode *, struct file *);
> @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file *filep)
>   		return -EPERM;
>   	}
>   
> -	process = kfd_create_process(filep);
> +	process = kfd_create_process(current);
>   	if (IS_ERR(process))
>   		return PTR_ERR(process);
>   
> +	if (kfd_process_init_cwsr_apu(process, filep)) {
> +		kfd_unref_process(process);
> +		return -EFAULT;
> +	}
> +
>   	if (kfd_is_locked()) {
>   		dev_dbg(kfd_device, "kfd is locked!\n"
>   				"process %d unreferenced", process->pasid);
> @@ -2653,6 +2659,9 @@ static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
>   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
>   {
>   	struct kfd_ioctl_dbg_trap_args *args = data;
> +	struct task_struct *thread = NULL;
> +	struct pid *pid = NULL;
> +	struct kfd_process *target = NULL;
>   	int r = 0;
>   
>   	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> @@ -2660,9 +2669,71 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		return -EINVAL;
>   	}
>   
> +	pid = find_get_pid(args->pid);
> +	if (!pid) {
> +		pr_debug("Cannot find pid info for %i\n", args->pid);
> +		r = -ESRCH;
> +		goto out;
> +	}
> +
> +	thread = get_pid_task(pid, PIDTYPE_PID);
> +
> +	if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
> +		bool create_process;
> +
> +		rcu_read_lock();
> +		create_process = thread && thread != current && ptrace_parent(thread) == current;
> +		rcu_read_unlock();
> +
> +		target = create_process ? kfd_create_process(thread) :
> +					kfd_lookup_process_by_pid(pid);
> +	} else {
> +		target = kfd_lookup_process_by_pid(pid);
> +	}
> +
> +	if (!target) {
> +		pr_debug("Cannot find process PID %i to debug\n", args->pid);
> +		r = -ESRCH;
> +		goto out;
> +	}
> +
> +	/* Check if target is still PTRACED. */
> +	rcu_read_lock();
> +	if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
> +				&& ptrace_parent(target->lead_thread) != current) {
> +		pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
> +		r = -EPERM;
> +	}
> +	rcu_read_unlock();
> +
> +	if (r)
> +		goto out;
> +
> +	mutex_lock(&target->mutex);
> +
> +	if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
> +		pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
> +		r = -EINVAL;
> +		goto unlock_out;
> +	}
> +
>   	switch (args->op) {
>   	case KFD_IOC_DBG_TRAP_ENABLE:
> +		if (target != p)
> +			target->debugger_process = p;
> +
> +		r = kfd_dbg_trap_enable(target,
> +					args->enable.dbg_fd,
> +					(void __user *)args->enable.rinfo_ptr,
> +					&args->enable.rinfo_size);
> +		if (!r)
> +			target->exception_enable_mask = args->enable.exception_mask;
> +
> +		pr_warn("Debug functions limited\n");
> +		break;
>   	case KFD_IOC_DBG_TRAP_DISABLE:
> +		r = kfd_dbg_trap_disable(target);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
>   	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> @@ -2676,7 +2747,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
>   	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> -		pr_warn("Debugging not supported yet\n");
> +		pr_warn("Debug op %i not supported yet\n", args->op);
>   		r = -EACCES;
>   		break;
>   	default:
> @@ -2684,6 +2755,19 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   		r = -EINVAL;
>   	}
>   
> +unlock_out:
> +	mutex_unlock(&target->mutex);
> +
> +out:
> +	if (thread)
> +		put_task_struct(thread);
> +
> +	if (pid)
> +		put_pid(pid);
> +
> +	if (target)
> +		kfd_unref_process(target);
> +
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> new file mode 100644
> index 000000000000..f6ea6db266b4
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -0,0 +1,94 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "kfd_debug.h"
> +#include <linux/file.h>
> +
> +void debug_event_write_work_handler(struct work_struct *work)
> +{
> +	struct kfd_process *process;
> +
> +	static const char write_data = '.';
> +	loff_t pos = 0;
> +
> +	process = container_of(work,
> +			struct kfd_process,
> +			debug_event_workarea);
> +
> +	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> +}
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target)
> +{
> +	if (!target->debug_trap_enabled)
> +		return 0;
> +
> +	fput(target->dbg_ev_file);
> +	target->dbg_ev_file = NULL;
> +
> +	if (target->debugger_process) {
> +		atomic_dec(&target->debugger_process->debugged_process_count);
> +		target->debugger_process = NULL;
> +	}
> +
> +	target->debug_trap_enabled = false;
> +	kfd_unref_process(target);
> +
> +	return 0;
> +}
> +
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> +			void __user *runtime_info, uint32_t *runtime_size)
> +{
> +	struct file *f;
> +	uint32_t copy_size;
> +	int r = 0;
> +
> +	if (target->debug_trap_enabled)
> +		return -EALREADY;
> +
> +	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
> +
> +	f = fget(fd);
> +	if (!f) {
> +		pr_err("Failed to get file for (%i)\n", fd);
> +		return -EBADF;
> +	}
> +
> +	target->dbg_ev_file = f;
> +
> +	/* We already hold the process reference but hold another one for the
> +	 * debug session.
> +	 */
> +	kref_get(&target->ref);
> +	target->debug_trap_enabled = true;
> +
> +	if (target->debugger_process)
> +		atomic_inc(&target->debugger_process->debugged_process_count);
> +
> +	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
> +		r = -EFAULT;
> +
> +	*runtime_size = sizeof(target->runtime_info);
> +
> +	return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> new file mode 100644
> index 000000000000..b2217eb1399c
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright 2022 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
> +#define KFD_DEBUG_EVENTS_H_INCLUDED
> +
> +#include "kfd_priv.h"
> +
> +int kfd_dbg_trap_disable(struct kfd_process *target);
> +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> +			void __user *runtime_info,
> +			uint32_t *runtime_info_size);
> +void debug_event_write_work_handler(struct work_struct *work);
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c06ada0844ba..a2ac98d06e71 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   		goto out;
>   
>   	pdd = qpd_to_pdd(qpd);
> +
> +	/* The debugger creates processes that temporarily have not acquired
> +	 * all VMs for all devices and has no VMs itself.
> +	 * Skip queue eviction on process eviction.
> +	 */
> +	if (!pdd->drm_priv)
> +		goto out;
> +
This should be before qpd->
>   	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>   			    pdd->process->pasid);
>   
> @@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   {
>   	struct queue *q;
>   	struct kfd_process_device *pdd;
> -	uint64_t pd_base;
>   	uint64_t eviction_duration;
>   	int retval = 0;
>   
>   	pdd = qpd_to_pdd(qpd);
> -	/* Retrieve PD base */
> -	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
>   
>   	dqm_lock(dqm);
>   	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
> @@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   		goto out;
>   	}
>   
> +	/* The debugger creates processes that temporarily have not acquired
> +	 * all VMs for all devices and has no VMs itself.
> +	 * Skip queue restore on process restore.
> +	 */
> +	if (!pdd->drm_priv)
> +		goto out;
> +

I had a comment here that "qpd->evicted = 0;" was duplicated. It is 
still needed in this case. Otherwise the process will end up being 
created with all queues in an evicted state and no way to execute 
anything on the GPU.

You only need one instance of "qpd->evicted = 0;", but it needs to be in 
the right place (after the vm_not_acquired label you had in v1 of this 
patch).

Regards,
   Felix


>   	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
>   			    pdd->process->pasid);
>   
>   	/* Update PD Base in QPD */
> -	qpd->page_table_base = pd_base;
> -	pr_debug("Updated PD address to 0x%llx\n", pd_base);
> +	qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
> +	pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
>   
>   	/* activate all active queues on the qpd */
>   	list_for_each_entry(q, &qpd->queues_list, list) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index bfa30d12406b..62b75ba28425 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -886,19 +886,48 @@ struct kfd_process {
>   	 */
>   	unsigned long last_restore_timestamp;
>   
> +	/* Indicates device process is debug attached with reserved vmid. */
> +	bool debug_trap_enabled;
> +
> +	/* per-process-per device debug event fd file */
> +	struct file *dbg_ev_file;
> +
> +	/* If the process is a kfd debugger, we need to know so we can clean
> +	 * up at exit time.  If a process enables debugging on itself, it does
> +	 * its own clean-up, so we don't set the flag here.  We track this by
> +	 * counting the number of processes this process is debugging.
> +	 */
> +	atomic_t debugged_process_count;
> +
> +	/* If the process is a debugged, this is the debugger process */
> +	struct kfd_process *debugger_process;
> +
>   	/* Kobj for our procfs */
>   	struct kobject *kobj;
>   	struct kobject *kobj_queues;
>   	struct attribute attr_pasid;
>   
> +	/* Keep track cwsr init */
> +	bool has_cwsr;
> +
> +	/* Exception code enable mask and status */
> +	uint64_t exception_enable_mask;
> +
>   	/* shared virtual memory registered by this process */
>   	struct svm_range_list svms;
>   
>   	bool xnack_enabled;
>   
> +	/* Work area for debugger event writer worker. */
> +	struct work_struct debug_event_workarea;
> +
>   	atomic_t poison;
>   	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
>   	bool queues_paused;
> +
> +	/* Tracks runtime enable status */
> +	struct kfd_runtime_info runtime_info;
> +
>   };
>   
>   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> @@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
>   
>   int kfd_process_create_wq(void);
>   void kfd_process_destroy_wq(void);
> -struct kfd_process *kfd_create_process(struct file *filep);
> +struct kfd_process *kfd_create_process(struct task_struct *thread);
>   struct kfd_process *kfd_get_process(const struct task_struct *task);
>   struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
>   struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
> @@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr);
>   
> +/* CWSR initialization */
> +int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
> +
>   /* CRIU */
>   /*
>    * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 72df6286e240..e935158ab311 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -44,6 +44,7 @@ struct mm_struct;
>   #include "kfd_iommu.h"
>   #include "kfd_svm.h"
>   #include "kfd_smi_events.h"
> +#include "kfd_debug.h"
>   
>   /*
>    * List of struct kfd_process (field kfd_process).
> @@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct task_struct *thread,
>   					bool ref);
>   static void kfd_process_ref_release(struct kref *ref);
>   static struct kfd_process *create_process(const struct task_struct *thread);
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
>   
>   static void evict_process_worker(struct work_struct *work);
>   static void restore_process_worker(struct work_struct *work);
> @@ -798,18 +798,19 @@ static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
>   	kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
>   }
>   
> -struct kfd_process *kfd_create_process(struct file *filep)
> +struct kfd_process *kfd_create_process(struct task_struct *thread)
>   {
>   	struct kfd_process *process;
> -	struct task_struct *thread = current;
>   	int ret;
>   
> -	if (!thread->mm)
> +	if (!(thread->mm && mmget_not_zero(thread->mm)))
>   		return ERR_PTR(-EINVAL);
>   
>   	/* Only the pthreads threading model is supported. */
> -	if (thread->group_leader->mm != thread->mm)
> +	if (thread->group_leader->mm != thread->mm) {
> +		mmput(thread->mm);
>   		return ERR_PTR(-EINVAL);
> +	}
>   
>   	/*
>   	 * take kfd processes mutex before starting of process creation
> @@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file *filep)
>   		if (IS_ERR(process))
>   			goto out;
>   
> -		ret = kfd_process_init_cwsr_apu(process, filep);
> -		if (ret)
> -			goto out_destroy;
> -
>   		if (!procfs.kobj)
>   			goto out;
>   
> @@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
>   	if (!IS_ERR(process))
>   		kref_get(&process->ref);
>   	mutex_unlock(&kfd_processes_mutex);
> +	mmput(thread->mm);
>   
>   	return process;
> -
> -out_destroy:
> -	hash_del_rcu(&process->kfd_processes);
> -	mutex_unlock(&kfd_processes_mutex);
> -	synchronize_srcu(&kfd_processes_srcu);
> -	/* kfd_process_free_notifier will trigger the cleanup */
> -	mmu_notifier_put(&process->mmu_notifier);
> -	return ERR_PTR(ret);
>   }
>   
>   struct kfd_process *kfd_get_process(const struct task_struct *thread)
> @@ -1115,6 +1105,26 @@ static void kfd_process_wq_release(struct work_struct *work)
>   	struct kfd_process *p = container_of(work, struct kfd_process,
>   					     release_work);
>   
> +	kfd_dbg_trap_disable(p);
> +
> +	if (atomic_read(&p->debugged_process_count) > 0) {
> +		struct kfd_process *target;
> +		unsigned int temp;
> +		int idx = srcu_read_lock(&kfd_processes_srcu);
> +
> +		hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
> +			if (target->debugger_process && target->debugger_process == p) {
> +				mutex_lock(&target->mutex);
> +				kfd_dbg_trap_disable(target);
> +				mutex_unlock(&target->mutex);
> +				if (atomic_read(&p->debugged_process_count) == 0)
> +					break;
> +			}
> +		}
> +
> +		srcu_read_unlock(&kfd_processes_srcu, idx);
> +	}
> +
>   	kfd_process_dequeue_from_all_devices(p);
>   	pqm_uninit(&p->pqm);
>   
> @@ -1200,11 +1210,14 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
>   	.free_notifier = kfd_process_free_notifier,
>   };
>   
> -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> +int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   {
>   	unsigned long  offset;
>   	int i;
>   
> +	if (p->has_cwsr)
> +		return 0;
> +
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_dev *dev = p->pdds[i]->dev;
>   		struct qcm_process_device *qpd = &p->pdds[i]->qpd;
> @@ -1233,6 +1246,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
>   			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>   	}
>   
> +	p->has_cwsr = true;
> +
>   	return 0;
>   }
>   
> @@ -1375,6 +1390,10 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	if (err)
>   		goto err_event_init;
>   	process->is_32bit_user_mode = in_compat_syscall();
> +	process->debug_trap_enabled = false;
> +	process->debugger_process = NULL;
> +	process->exception_enable_mask = 0;
> +	atomic_set(&process->debugged_process_count, 0);
>   
>   	process->pasid = kfd_pasid_alloc();
>   	if (process->pasid == 0) {
> @@ -1422,6 +1441,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
>   	kfd_unref_process(process);
>   	get_task_struct(process->lead_thread);
>   
> +	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
> +
>   	return process;
>   
>   err_register_notifier:
> @@ -1908,8 +1929,10 @@ static void restore_process_worker(struct work_struct *work)
>   	 */
>   
>   	p->last_restore_timestamp = get_jiffies_64();
> -	ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> -						     &p->ef);
> +	/* VMs may not have been acquired yet during debugging. */
> +	if (p->kgd_process_info)
> +		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
> +							     &p->ef);
>   	if (ret) {
>   		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
>   			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
Kim, Jonathan March 23, 2023, 7:12 p.m. UTC | #2
[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@amd.com>
> Sent: Thursday, February 16, 2023 6:44 PM
> To: Kim, Jonathan <Jonathan.Kim@amd.com>; amd-
> gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Subject: Re: [PATCH 03/32] drm/amdkfd: prepare per-process debug enable
> and disable
>
>
> On 2023-01-25 14:53, Jonathan Kim wrote:
> > The ROCm debugger will attach to a process to debug by PTRACE and will
> > expect the KFD to prepare a process for the target PID, whether the
> > target PID has opened the KFD device or not.
> >
> > This patch is to explicity handle this requirement.  Further HW mode
> > setting and runtime coordination requirements will be handled in
> > following patches.
> >
> > In the case where the target process has not opened the KFD device,
> > a new KFD process must be created for the target PID.
> > The debugger as well as the target process for this case will have not
> > acquired any VMs so handle process restoration to correctly account for
> > this.
> >
> > To coordinate with HSA runtime, the debugger must be aware of the target
> > process' runtime enablement status and will copy the runtime status
> > information into the debugged KFD process for later query.
> >
> > On enablement, the debugger will subscribe to a set of exceptions where
> > each exception events will notify the debugger through a pollable FIFO
> > file descriptor that the debugger provides to the KFD to manage.
> > Some events will be synchronously raised while other are scheduled,
> > which is why a debug_event_workarea worker is initialized.
> >
> > Finally on process termination of either the debugger or the target,
> > debugging must be disabled if it has not been done so.
> >
> > v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
> > remove unnecessary queue eviction counter reset when there's nothing
> > to evict.
> > change err code to EALREADY if attaching to an already attached process.
> > move debug disable to release worker to avoid race with disable from
> > ioctl call.
> >
> > v2: relax debug trap disable and PTRACE ATTACH requirement.
> >
> > Signed-off-by: Jonathan Kim<jonathan.kim@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/Makefile           |  3 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 88 ++++++++++++++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 94
> +++++++++++++++++++
> >   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        | 33 +++++++
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 34 ++++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 63 +++++++++----
> >   7 files changed, 308 insertions(+), 29 deletions(-)
> >   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> >   create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile
> b/drivers/gpu/drm/amd/amdkfd/Makefile
> > index e758c2a24cd0..747754428073 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/Makefile
> > +++ b/drivers/gpu/drm/amd/amdkfd/Makefile
> > @@ -55,7 +55,8 @@ AMDKFD_FILES      := $(AMDKFD_PATH)/kfd_module.o \
> >             $(AMDKFD_PATH)/kfd_int_process_v9.o \
> >             $(AMDKFD_PATH)/kfd_int_process_v11.o \
> >             $(AMDKFD_PATH)/kfd_smi_events.o \
> > -           $(AMDKFD_PATH)/kfd_crat.o
> > +           $(AMDKFD_PATH)/kfd_crat.o \
> > +           $(AMDKFD_PATH)/kfd_debug.o
> >
> >   ifneq ($(CONFIG_AMD_IOMMU_V2),)
> >   AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index d3b019e64093..ee05c2e54ef6 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -44,6 +44,7 @@
> >   #include "amdgpu_amdkfd.h"
> >   #include "kfd_smi_events.h"
> >   #include "amdgpu_dma_buf.h"
> > +#include "kfd_debug.h"
> >
> >   static long kfd_ioctl(struct file *, unsigned int, unsigned long);
> >   static int kfd_open(struct inode *, struct file *);
> > @@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct
> file *filep)
> >             return -EPERM;
> >     }
> >
> > -   process = kfd_create_process(filep);
> > +   process = kfd_create_process(current);
> >     if (IS_ERR(process))
> >             return PTR_ERR(process);
> >
> > +   if (kfd_process_init_cwsr_apu(process, filep)) {
> > +           kfd_unref_process(process);
> > +           return -EFAULT;
> > +   }
> > +
> >     if (kfd_is_locked()) {
> >             dev_dbg(kfd_device, "kfd is locked!\n"
> >                             "process %d unreferenced", process->pasid);
> > @@ -2653,6 +2659,9 @@ static int kfd_ioctl_runtime_enable(struct file
> *filep, struct kfd_process *p, v
> >   static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process
> *p, void *data)
> >   {
> >     struct kfd_ioctl_dbg_trap_args *args = data;
> > +   struct task_struct *thread = NULL;
> > +   struct pid *pid = NULL;
> > +   struct kfd_process *target = NULL;
> >     int r = 0;
> >
> >     if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
> > @@ -2660,9 +2669,71 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> >             return -EINVAL;
> >     }
> >
> > +   pid = find_get_pid(args->pid);
> > +   if (!pid) {
> > +           pr_debug("Cannot find pid info for %i\n", args->pid);
> > +           r = -ESRCH;
> > +           goto out;
> > +   }
> > +
> > +   thread = get_pid_task(pid, PIDTYPE_PID);
> > +
> > +   if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
> > +           bool create_process;
> > +
> > +           rcu_read_lock();
> > +           create_process = thread && thread != current &&
> ptrace_parent(thread) == current;
> > +           rcu_read_unlock();
> > +
> > +           target = create_process ? kfd_create_process(thread) :
> > +                                   kfd_lookup_process_by_pid(pid);
> > +   } else {
> > +           target = kfd_lookup_process_by_pid(pid);
> > +   }
> > +
> > +   if (!target) {
> > +           pr_debug("Cannot find process PID %i to debug\n", args-
> >pid);
> > +           r = -ESRCH;
> > +           goto out;
> > +   }
> > +
> > +   /* Check if target is still PTRACED. */
> > +   rcu_read_lock();
> > +   if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
> > +                           && ptrace_parent(target->lead_thread) !=
> current) {
> > +           pr_err("PID %i is not PTRACED and cannot be debugged\n",
> args->pid);
> > +           r = -EPERM;
> > +   }
> > +   rcu_read_unlock();
> > +
> > +   if (r)
> > +           goto out;
> > +
> > +   mutex_lock(&target->mutex);
> > +
> > +   if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target-
> >debug_trap_enabled) {
> > +           pr_err("PID %i not debug enabled for op %i\n", args->pid,
> args->op);
> > +           r = -EINVAL;
> > +           goto unlock_out;
> > +   }
> > +
> >     switch (args->op) {
> >     case KFD_IOC_DBG_TRAP_ENABLE:
> > +           if (target != p)
> > +                   target->debugger_process = p;
> > +
> > +           r = kfd_dbg_trap_enable(target,
> > +                                   args->enable.dbg_fd,
> > +                                   (void __user *)args->enable.rinfo_ptr,
> > +                                   &args->enable.rinfo_size);
> > +           if (!r)
> > +                   target->exception_enable_mask = args-
> >enable.exception_mask;
> > +
> > +           pr_warn("Debug functions limited\n");
> > +           break;
> >     case KFD_IOC_DBG_TRAP_DISABLE:
> > +           r = kfd_dbg_trap_disable(target);
> > +           break;
> >     case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
> >     case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
> >     case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
> > @@ -2676,7 +2747,7 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> >     case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
> >     case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
> >     case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> > -           pr_warn("Debugging not supported yet\n");
> > +           pr_warn("Debug op %i not supported yet\n", args->op);
> >             r = -EACCES;
> >             break;
> >     default:
> > @@ -2684,6 +2755,19 @@ static int kfd_ioctl_set_debug_trap(struct file
> *filep, struct kfd_process *p, v
> >             r = -EINVAL;
> >     }
> >
> > +unlock_out:
> > +   mutex_unlock(&target->mutex);
> > +
> > +out:
> > +   if (thread)
> > +           put_task_struct(thread);
> > +
> > +   if (pid)
> > +           put_pid(pid);
> > +
> > +   if (target)
> > +           kfd_unref_process(target);
> > +
> >     return r;
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > new file mode 100644
> > index 000000000000..f6ea6db266b4
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > @@ -0,0 +1,94 @@
> > +/*
> > + * Copyright 2022 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the
> "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be included
> in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + */
> > +
> > +#include "kfd_debug.h"
> > +#include <linux/file.h>
> > +
> > +void debug_event_write_work_handler(struct work_struct *work)
> > +{
> > +   struct kfd_process *process;
> > +
> > +   static const char write_data = '.';
> > +   loff_t pos = 0;
> > +
> > +   process = container_of(work,
> > +                   struct kfd_process,
> > +                   debug_event_workarea);
> > +
> > +   kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
> > +}
> > +
> > +int kfd_dbg_trap_disable(struct kfd_process *target)
> > +{
> > +   if (!target->debug_trap_enabled)
> > +           return 0;
> > +
> > +   fput(target->dbg_ev_file);
> > +   target->dbg_ev_file = NULL;
> > +
> > +   if (target->debugger_process) {
> > +           atomic_dec(&target->debugger_process-
> >debugged_process_count);
> > +           target->debugger_process = NULL;
> > +   }
> > +
> > +   target->debug_trap_enabled = false;
> > +   kfd_unref_process(target);
> > +
> > +   return 0;
> > +}
> > +
> > +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> > +                   void __user *runtime_info, uint32_t *runtime_size)
> > +{
> > +   struct file *f;
> > +   uint32_t copy_size;
> > +   int r = 0;
> > +
> > +   if (target->debug_trap_enabled)
> > +           return -EALREADY;
> > +
> > +   copy_size = min((size_t)(*runtime_size), sizeof(target-
> >runtime_info));
> > +
> > +   f = fget(fd);
> > +   if (!f) {
> > +           pr_err("Failed to get file for (%i)\n", fd);
> > +           return -EBADF;
> > +   }
> > +
> > +   target->dbg_ev_file = f;
> > +
> > +   /* We already hold the process reference but hold another one for
> the
> > +    * debug session.
> > +    */
> > +   kref_get(&target->ref);
> > +   target->debug_trap_enabled = true;
> > +
> > +   if (target->debugger_process)
> > +           atomic_inc(&target->debugger_process-
> >debugged_process_count);
> > +
> > +   if (copy_to_user(runtime_info, (void *)&target->runtime_info,
> copy_size))
> > +           r = -EFAULT;
> > +
> > +   *runtime_size = sizeof(target->runtime_info);
> > +
> > +   return r;
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > new file mode 100644
> > index 000000000000..b2217eb1399c
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> > @@ -0,0 +1,33 @@
> > +/*
> > + * Copyright 2022 Advanced Micro Devices, Inc.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the
> "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be included
> in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> OTHERWISE,
> > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
> THE USE OR
> > + * OTHER DEALINGS IN THE SOFTWARE.
> > + */
> > +
> > +#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
> > +#define KFD_DEBUG_EVENTS_H_INCLUDED
> > +
> > +#include "kfd_priv.h"
> > +
> > +int kfd_dbg_trap_disable(struct kfd_process *target);
> > +int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
> > +                   void __user *runtime_info,
> > +                   uint32_t *runtime_info_size);
> > +void debug_event_write_work_handler(struct work_struct *work);
> > +#endif
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index c06ada0844ba..a2ac98d06e71 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct
> device_queue_manager *dqm,
> >             goto out;
> >
> >     pdd = qpd_to_pdd(qpd);
> > +
> > +   /* The debugger creates processes that temporarily have not
> acquired
> > +    * all VMs for all devices and has no VMs itself.
> > +    * Skip queue eviction on process eviction.
> > +    */
> > +   if (!pdd->drm_priv)
> > +           goto out;
> > +
> This should be before qpd->

Sorry, I didn't quite catch what you were saying here (did your comment get cut off?).
Did you mean the pdd->drm_priv check needs to go before the if (qpd->evicted++ > 0) /* already evicted, do nothing */ check?

Thanks,

Jon

> >     pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
> >                         pdd->process->pasid);
> >
> > @@ -1100,13 +1108,10 @@ static int restore_process_queues_cpsch(struct
> device_queue_manager *dqm,
> >   {
> >     struct queue *q;
> >     struct kfd_process_device *pdd;
> > -   uint64_t pd_base;
> >     uint64_t eviction_duration;
> >     int retval = 0;
> >
> >     pdd = qpd_to_pdd(qpd);
> > -   /* Retrieve PD base */
> > -   pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd-
> >drm_priv);
> >
> >     dqm_lock(dqm);
> >     if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing
> */
> > @@ -1116,12 +1121,19 @@ static int restore_process_queues_cpsch(struct
> device_queue_manager *dqm,
> >             goto out;
> >     }
> >
> > +   /* The debugger creates processes that temporarily have not
> acquired
> > +    * all VMs for all devices and has no VMs itself.
> > +    * Skip queue restore on process restore.
> > +    */
> > +   if (!pdd->drm_priv)
> > +           goto out;
> > +
>
> I had a comment here that "qpd->evicted = 0;" was duplicated. It is
> still needed in this case. Otherwise the process will end up being
> created with all queues in an evicted state and no way to execute
> anything on the GPU.
>
> You only need one instance of "qpd->evicted = 0;", but it needs to be in
> the right place (after the vm_not_acquired label you had in v1 of this
> patch).
>
> Regards,
>    Felix
>
>
> >     pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
> >                         pdd->process->pasid);
> >
> >     /* Update PD Base in QPD */
> > -   qpd->page_table_base = pd_base;
> > -   pr_debug("Updated PD address to 0x%llx\n", pd_base);
> > +   qpd->page_table_base =
> amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
> > +   pr_debug("Updated PD address to 0x%llx\n", qpd-
> >page_table_base);
> >
> >     /* activate all active queues on the qpd */
> >     list_for_each_entry(q, &qpd->queues_list, list) {
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index bfa30d12406b..62b75ba28425 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -886,19 +886,48 @@ struct kfd_process {
> >      */
> >     unsigned long last_restore_timestamp;
> >
> > +   /* Indicates device process is debug attached with reserved vmid. */
> > +   bool debug_trap_enabled;
> > +
> > +   /* per-process-per device debug event fd file */
> > +   struct file *dbg_ev_file;
> > +
> > +   /* If the process is a kfd debugger, we need to know so we can clean
> > +    * up at exit time.  If a process enables debugging on itself, it does
> > +    * its own clean-up, so we don't set the flag here.  We track this by
> > +    * counting the number of processes this process is debugging.
> > +    */
> > +   atomic_t debugged_process_count;
> > +
> > +   /* If the process is being debugged, this is the debugger process */
> > +   struct kfd_process *debugger_process;
> > +
> >     /* Kobj for our procfs */
> >     struct kobject *kobj;
> >     struct kobject *kobj_queues;
> >     struct attribute attr_pasid;
> >
> > +   /* Keep track of CWSR init */
> > +   bool has_cwsr;
> > +
> > +   /* Exception code enable mask and status */
> > +   uint64_t exception_enable_mask;
> > +
> >     /* shared virtual memory registered by this process */
> >     struct svm_range_list svms;
> >
> >     bool xnack_enabled;
> >
> > +   /* Work area for debugger event writer worker. */
> > +   struct work_struct debug_event_workarea;
> > +
> >     atomic_t poison;
> >     /* Queues are in paused stated because we are in the process of
> doing a CRIU checkpoint */
> >     bool queues_paused;
> > +
> > +   /* Tracks runtime enable status */
> > +   struct kfd_runtime_info runtime_info;
> > +
> >   };
> >
> >   #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
> > @@ -928,7 +957,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev);
> >
> >   int kfd_process_create_wq(void);
> >   void kfd_process_destroy_wq(void);
> > -struct kfd_process *kfd_create_process(struct file *filep);
> > +struct kfd_process *kfd_create_process(struct task_struct *thread);
> >   struct kfd_process *kfd_get_process(const struct task_struct *task);
> >   struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
> >   struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct
> *mm);
> > @@ -1055,6 +1084,9 @@ void kfd_process_set_trap_handler(struct
> qcm_process_device *qpd,
> >                               uint64_t tba_addr,
> >                               uint64_t tma_addr);
> >
> > +/* CWSR initialization */
> > +int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file
> *filep);
> > +
> >   /* CRIU */
> >   /*
> >    * Need to increment KFD_CRIU_PRIV_VERSION each time a change is
> made to any of the CRIU private
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > index 72df6286e240..e935158ab311 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> > @@ -44,6 +44,7 @@ struct mm_struct;
> >   #include "kfd_iommu.h"
> >   #include "kfd_svm.h"
> >   #include "kfd_smi_events.h"
> > +#include "kfd_debug.h"
> >
> >   /*
> >    * List of struct kfd_process (field kfd_process).
> > @@ -69,7 +70,6 @@ static struct kfd_process *find_process(const struct
> task_struct *thread,
> >                                     bool ref);
> >   static void kfd_process_ref_release(struct kref *ref);
> >   static struct kfd_process *create_process(const struct task_struct
> *thread);
> > -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file
> *filep);
> >
> >   static void evict_process_worker(struct work_struct *work);
> >   static void restore_process_worker(struct work_struct *work);
> > @@ -798,18 +798,19 @@ static void
> kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
> >     kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
> >   }
> >
> > -struct kfd_process *kfd_create_process(struct file *filep)
> > +struct kfd_process *kfd_create_process(struct task_struct *thread)
> >   {
> >     struct kfd_process *process;
> > -   struct task_struct *thread = current;
> >     int ret;
> >
> > -   if (!thread->mm)
> > +   if (!(thread->mm && mmget_not_zero(thread->mm)))
> >             return ERR_PTR(-EINVAL);
> >
> >     /* Only the pthreads threading model is supported. */
> > -   if (thread->group_leader->mm != thread->mm)
> > +   if (thread->group_leader->mm != thread->mm) {
> > +           mmput(thread->mm);
> >             return ERR_PTR(-EINVAL);
> > +   }
> >
> >     /*
> >      * take kfd processes mutex before starting of process creation
> > @@ -827,10 +828,6 @@ struct kfd_process *kfd_create_process(struct file
> *filep)
> >             if (IS_ERR(process))
> >                     goto out;
> >
> > -           ret = kfd_process_init_cwsr_apu(process, filep);
> > -           if (ret)
> > -                   goto out_destroy;
> > -
> >             if (!procfs.kobj)
> >                     goto out;
> >
> > @@ -864,16 +861,9 @@ struct kfd_process *kfd_create_process(struct file
> *filep)
> >     if (!IS_ERR(process))
> >             kref_get(&process->ref);
> >     mutex_unlock(&kfd_processes_mutex);
> > +   mmput(thread->mm);
> >
> >     return process;
> > -
> > -out_destroy:
> > -   hash_del_rcu(&process->kfd_processes);
> > -   mutex_unlock(&kfd_processes_mutex);
> > -   synchronize_srcu(&kfd_processes_srcu);
> > -   /* kfd_process_free_notifier will trigger the cleanup */
> > -   mmu_notifier_put(&process->mmu_notifier);
> > -   return ERR_PTR(ret);
> >   }
> >
> >   struct kfd_process *kfd_get_process(const struct task_struct *thread)
> > @@ -1115,6 +1105,26 @@ static void kfd_process_wq_release(struct
> work_struct *work)
> >     struct kfd_process *p = container_of(work, struct kfd_process,
> >                                          release_work);
> >
> > +   kfd_dbg_trap_disable(p);
> > +
> > +   if (atomic_read(&p->debugged_process_count) > 0) {
> > +           struct kfd_process *target;
> > +           unsigned int temp;
> > +           int idx = srcu_read_lock(&kfd_processes_srcu);
> > +
> > +           hash_for_each_rcu(kfd_processes_table, temp, target,
> kfd_processes) {
> > +                   if (target->debugger_process && target-
> >debugger_process == p) {
> > +                           mutex_lock(&target->mutex);
> > +                           kfd_dbg_trap_disable(target);
> > +                           mutex_unlock(&target->mutex);
> > +                           if (atomic_read(&p-
> >debugged_process_count) == 0)
> > +                                   break;
> > +                   }
> > +           }
> > +
> > +           srcu_read_unlock(&kfd_processes_srcu, idx);
> > +   }
> > +
> >     kfd_process_dequeue_from_all_devices(p);
> >     pqm_uninit(&p->pqm);
> >
> > @@ -1200,11 +1210,14 @@ static const struct mmu_notifier_ops
> kfd_process_mmu_notifier_ops = {
> >     .free_notifier = kfd_process_free_notifier,
> >   };
> >
> > -static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file
> *filep)
> > +int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
> >   {
> >     unsigned long  offset;
> >     int i;
> >
> > +   if (p->has_cwsr)
> > +           return 0;
> > +
> >     for (i = 0; i < p->n_pdds; i++) {
> >             struct kfd_dev *dev = p->pdds[i]->dev;
> >             struct qcm_process_device *qpd = &p->pdds[i]->qpd;
> > @@ -1233,6 +1246,8 @@ static int kfd_process_init_cwsr_apu(struct
> kfd_process *p, struct file *filep)
> >                     qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> >     }
> >
> > +   p->has_cwsr = true;
> > +
> >     return 0;
> >   }
> >
> > @@ -1375,6 +1390,10 @@ static struct kfd_process *create_process(const
> struct task_struct *thread)
> >     if (err)
> >             goto err_event_init;
> >     process->is_32bit_user_mode = in_compat_syscall();
> > +   process->debug_trap_enabled = false;
> > +   process->debugger_process = NULL;
> > +   process->exception_enable_mask = 0;
> > +   atomic_set(&process->debugged_process_count, 0);
> >
> >     process->pasid = kfd_pasid_alloc();
> >     if (process->pasid == 0) {
> > @@ -1422,6 +1441,8 @@ static struct kfd_process *create_process(const
> struct task_struct *thread)
> >     kfd_unref_process(process);
> >     get_task_struct(process->lead_thread);
> >
> > +   INIT_WORK(&process->debug_event_workarea,
> debug_event_write_work_handler);
> > +
> >     return process;
> >
> >   err_register_notifier:
> > @@ -1908,8 +1929,10 @@ static void restore_process_worker(struct
> work_struct *work)
> >      */
> >
> >     p->last_restore_timestamp = get_jiffies_64();
> > -   ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p-
> >kgd_process_info,
> > -                                                &p->ef);
> > +   /* VMs may not have been acquired yet during debugging. */
> > +   if (p->kgd_process_info)
> > +           ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p-
> >kgd_process_info,
> > +                                                        &p->ef);
> >     if (ret) {
> >             pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d
> ms\n",
> >                      p->pasid, PROCESS_BACK_OFF_TIME_MS);
Felix Kuehling March 23, 2023, 8:08 p.m. UTC | #3
Sorry, I think that was just a stray comment that I messed up while 
editing my response. You can ignore it.

Regards,
   Felix


Am 2023-03-23 um 15:12 schrieb Kim, Jonathan:
>>> index c06ada0844ba..a2ac98d06e71 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -979,6 +979,14 @@ static int evict_process_queues_cpsch(struct
>> device_queue_manager *dqm,
>>>              goto out;
>>>
>>>      pdd = qpd_to_pdd(qpd);
>>> +
>>> +   /* The debugger creates processes that temporarily have not
>> acquired
>>> +    * all VMs for all devices and has no VMs itself.
>>> +    * Skip queue eviction on process eviction.
>>> +    */
>>> +   if (!pdd->drm_priv)
>>> +           goto out;
>>> +
>> This should be before qpd->
> Sorry, I didn't quite catch what you were saying here (did your comment get cut off?).
> Did you mean the pdd->drm_priv check needs to go before the if (qpd->evicted++ > 0) /* already evicted, do nothing */ check?
>
> Thanks,
>
> Jon
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index e758c2a24cd0..747754428073 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -55,7 +55,8 @@  AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_int_process_v9.o \
 		$(AMDKFD_PATH)/kfd_int_process_v11.o \
 		$(AMDKFD_PATH)/kfd_smi_events.o \
-		$(AMDKFD_PATH)/kfd_crat.o
+		$(AMDKFD_PATH)/kfd_crat.o \
+		$(AMDKFD_PATH)/kfd_debug.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d3b019e64093..ee05c2e54ef6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -44,6 +44,7 @@ 
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
 #include "amdgpu_dma_buf.h"
+#include "kfd_debug.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -142,10 +143,15 @@  static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
-	process = kfd_create_process(filep);
+	process = kfd_create_process(current);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
+	if (kfd_process_init_cwsr_apu(process, filep)) {
+		kfd_unref_process(process);
+		return -EFAULT;
+	}
+
 	if (kfd_is_locked()) {
 		dev_dbg(kfd_device, "kfd is locked!\n"
 				"process %d unreferenced", process->pasid);
@@ -2653,6 +2659,9 @@  static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, v
 static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_dbg_trap_args *args = data;
+	struct task_struct *thread = NULL;
+	struct pid *pid = NULL;
+	struct kfd_process *target = NULL;
 	int r = 0;
 
 	if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2660,9 +2669,71 @@  static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		return -EINVAL;
 	}
 
+	pid = find_get_pid(args->pid);
+	if (!pid) {
+		pr_debug("Cannot find pid info for %i\n", args->pid);
+		r = -ESRCH;
+		goto out;
+	}
+
+	thread = get_pid_task(pid, PIDTYPE_PID);
+
+	if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
+		bool create_process;
+
+		rcu_read_lock();
+		create_process = thread && thread != current && ptrace_parent(thread) == current;
+		rcu_read_unlock();
+
+		target = create_process ? kfd_create_process(thread) :
+					kfd_lookup_process_by_pid(pid);
+	} else {
+		target = kfd_lookup_process_by_pid(pid);
+	}
+
+	if (!target) {
+		pr_debug("Cannot find process PID %i to debug\n", args->pid);
+		r = -ESRCH;
+		goto out;
+	}
+
+	/* Check if target is still PTRACED. */
+	rcu_read_lock();
+	if (target != p && args->op != KFD_IOC_DBG_TRAP_DISABLE
+				&& ptrace_parent(target->lead_thread) != current) {
+		pr_err("PID %i is not PTRACED and cannot be debugged\n", args->pid);
+		r = -EPERM;
+	}
+	rcu_read_unlock();
+
+	if (r)
+		goto out;
+
+	mutex_lock(&target->mutex);
+
+	if (args->op != KFD_IOC_DBG_TRAP_ENABLE && !target->debug_trap_enabled) {
+		pr_err("PID %i not debug enabled for op %i\n", args->pid, args->op);
+		r = -EINVAL;
+		goto unlock_out;
+	}
+
 	switch (args->op) {
 	case KFD_IOC_DBG_TRAP_ENABLE:
+		if (target != p)
+			target->debugger_process = p;
+
+		r = kfd_dbg_trap_enable(target,
+					args->enable.dbg_fd,
+					(void __user *)args->enable.rinfo_ptr,
+					&args->enable.rinfo_size);
+		if (!r)
+			target->exception_enable_mask = args->enable.exception_mask;
+
+		pr_warn("Debug functions limited\n");
+		break;
 	case KFD_IOC_DBG_TRAP_DISABLE:
+		r = kfd_dbg_trap_disable(target);
+		break;
 	case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
 	case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
 	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
@@ -2676,7 +2747,7 @@  static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 	case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
 	case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-		pr_warn("Debugging not supported yet\n");
+		pr_warn("Debug op %i not supported yet\n", args->op);
 		r = -EACCES;
 		break;
 	default:
@@ -2684,6 +2755,19 @@  static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 		r = -EINVAL;
 	}
 
+unlock_out:
+	mutex_unlock(&target->mutex);
+
+out:
+	if (thread)
+		put_task_struct(thread);
+
+	if (pid)
+		put_pid(pid);
+
+	if (target)
+		kfd_unref_process(target);
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
new file mode 100644
index 000000000000..f6ea6db266b4
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -0,0 +1,94 @@ 
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "kfd_debug.h"
+#include <linux/file.h>
+
+void debug_event_write_work_handler(struct work_struct *work)
+{
+	struct kfd_process *process;
+
+	static const char write_data = '.';
+	loff_t pos = 0;
+
+	process = container_of(work,
+			struct kfd_process,
+			debug_event_workarea);
+
+	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
+}
+
+int kfd_dbg_trap_disable(struct kfd_process *target)
+{
+	if (!target->debug_trap_enabled)
+		return 0;
+
+	fput(target->dbg_ev_file);
+	target->dbg_ev_file = NULL;
+
+	if (target->debugger_process) {
+		atomic_dec(&target->debugger_process->debugged_process_count);
+		target->debugger_process = NULL;
+	}
+
+	target->debug_trap_enabled = false;
+	kfd_unref_process(target);
+
+	return 0;
+}
+
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+			void __user *runtime_info, uint32_t *runtime_size)
+{
+	struct file *f;
+	uint32_t copy_size;
+	int r = 0;
+
+	if (target->debug_trap_enabled)
+		return -EALREADY;
+
+	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
+
+	f = fget(fd);
+	if (!f) {
+		pr_err("Failed to get file for (%i)\n", fd);
+		return -EBADF;
+	}
+
+	target->dbg_ev_file = f;
+
+	/* We already hold the process reference but hold another one for the
+	 * debug session.
+	 */
+	kref_get(&target->ref);
+	target->debug_trap_enabled = true;
+
+	if (target->debugger_process)
+		atomic_inc(&target->debugger_process->debugged_process_count);
+
+	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+		r = -EFAULT;
+
+	*runtime_size = sizeof(target->runtime_info);
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
new file mode 100644
index 000000000000..b2217eb1399c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -0,0 +1,33 @@ 
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_DEBUG_EVENTS_H_INCLUDED
+#define KFD_DEBUG_EVENTS_H_INCLUDED
+
+#include "kfd_priv.h"
+
+int kfd_dbg_trap_disable(struct kfd_process *target);
+int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
+			void __user *runtime_info,
+			uint32_t *runtime_info_size);
+void debug_event_write_work_handler(struct work_struct *work);
+#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c06ada0844ba..a2ac98d06e71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -979,6 +979,14 @@  static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
+
+	/* The debugger creates processes that temporarily have not acquired
+	 * all VMs for all devices and has no VMs itself.
+	 * Skip queue eviction on process eviction.
+	 */
+	if (!pdd->drm_priv)
+		goto out;
+
 	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
@@ -1100,13 +1108,10 @@  static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 {
 	struct queue *q;
 	struct kfd_process_device *pdd;
-	uint64_t pd_base;
 	uint64_t eviction_duration;
 	int retval = 0;
 
 	pdd = qpd_to_pdd(qpd);
-	/* Retrieve PD base */
-	pd_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
 
 	dqm_lock(dqm);
 	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
@@ -1116,12 +1121,19 @@  static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
+	/* The debugger creates processes that temporarily have not acquired
+	 * all VMs for all devices, and the debugger has no VMs itself.
+	 * Skip queue restore on process restore.
+	 */
+	if (!pdd->drm_priv)
+		goto out;
+
 	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
-	qpd->page_table_base = pd_base;
-	pr_debug("Updated PD address to 0x%llx\n", pd_base);
+	qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
+	pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
 
 	/* activate all active queues on the qpd */
 	list_for_each_entry(q, &qpd->queues_list, list) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index bfa30d12406b..62b75ba28425 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -886,19 +886,48 @@  struct kfd_process {
 	 */
 	unsigned long last_restore_timestamp;
 
+	/* Indicates device process is debug attached with reserved vmid. */
+	bool debug_trap_enabled;
+
+	/* per-process-per device debug event fd file */
+	struct file *dbg_ev_file;
+
+	/* If the process is a kfd debugger, we need to know so we can clean
+	 * up at exit time.  If a process enables debugging on itself, it does
+	 * its own clean-up, so we don't set the flag here.  We track this by
+	 * counting the number of processes this process is debugging.
+	 */
+	atomic_t debugged_process_count;
+
+	/* If the process is being debugged, this is the debugger process */
+	struct kfd_process *debugger_process;
+
 	/* Kobj for our procfs */
 	struct kobject *kobj;
 	struct kobject *kobj_queues;
 	struct attribute attr_pasid;
 
+	/* Set once CWSR has been initialized for this process */
+	bool has_cwsr;
+
+	/* Exception code enable mask and status */
+	uint64_t exception_enable_mask;
+
 	/* shared virtual memory registered by this process */
 	struct svm_range_list svms;
 
 	bool xnack_enabled;
 
+	/* Work area for debugger event writer worker. */
+	struct work_struct debug_event_workarea;
+
 	atomic_t poison;
 	/* Queues are in paused stated because we are in the process of doing a CRIU checkpoint */
 	bool queues_paused;
+
+	/* Tracks runtime enable status */
+	struct kfd_runtime_info runtime_info;
+
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
@@ -928,7 +957,7 @@  bool kfd_dev_is_large_bar(struct kfd_dev *dev);
 
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
-struct kfd_process *kfd_create_process(struct file *filep);
+struct kfd_process *kfd_create_process(struct task_struct *thread);
 struct kfd_process *kfd_get_process(const struct task_struct *task);
 struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
 struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
@@ -1055,6 +1084,9 @@  void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr);
 
+/* CWSR initialization */
+int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
+
 /* CRIU */
 /*
  * Need to increment KFD_CRIU_PRIV_VERSION each time a change is made to any of the CRIU private
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 72df6286e240..e935158ab311 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -44,6 +44,7 @@  struct mm_struct;
 #include "kfd_iommu.h"
 #include "kfd_svm.h"
 #include "kfd_smi_events.h"
+#include "kfd_debug.h"
 
 /*
  * List of struct kfd_process (field kfd_process).
@@ -69,7 +70,6 @@  static struct kfd_process *find_process(const struct task_struct *thread,
 					bool ref);
 static void kfd_process_ref_release(struct kref *ref);
 static struct kfd_process *create_process(const struct task_struct *thread);
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
 
 static void evict_process_worker(struct work_struct *work);
 static void restore_process_worker(struct work_struct *work);
@@ -798,18 +798,19 @@  static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
 	kfd_process_free_gpuvm(qpd->ib_mem, pdd, &qpd->ib_kaddr);
 }
 
-struct kfd_process *kfd_create_process(struct file *filep)
+struct kfd_process *kfd_create_process(struct task_struct *thread)
 {
 	struct kfd_process *process;
-	struct task_struct *thread = current;
 	int ret;
 
-	if (!thread->mm)
+	if (!(thread->mm && mmget_not_zero(thread->mm)))
 		return ERR_PTR(-EINVAL);
 
 	/* Only the pthreads threading model is supported. */
-	if (thread->group_leader->mm != thread->mm)
+	if (thread->group_leader->mm != thread->mm) {
+		mmput(thread->mm);
 		return ERR_PTR(-EINVAL);
+	}
 
 	/*
 	 * take kfd processes mutex before starting of process creation
@@ -827,10 +828,6 @@  struct kfd_process *kfd_create_process(struct file *filep)
 		if (IS_ERR(process))
 			goto out;
 
-		ret = kfd_process_init_cwsr_apu(process, filep);
-		if (ret)
-			goto out_destroy;
-
 		if (!procfs.kobj)
 			goto out;
 
@@ -864,16 +861,9 @@  struct kfd_process *kfd_create_process(struct file *filep)
 	if (!IS_ERR(process))
 		kref_get(&process->ref);
 	mutex_unlock(&kfd_processes_mutex);
+	mmput(thread->mm);
 
 	return process;
-
-out_destroy:
-	hash_del_rcu(&process->kfd_processes);
-	mutex_unlock(&kfd_processes_mutex);
-	synchronize_srcu(&kfd_processes_srcu);
-	/* kfd_process_free_notifier will trigger the cleanup */
-	mmu_notifier_put(&process->mmu_notifier);
-	return ERR_PTR(ret);
 }
 
 struct kfd_process *kfd_get_process(const struct task_struct *thread)
@@ -1115,6 +1105,26 @@  static void kfd_process_wq_release(struct work_struct *work)
 	struct kfd_process *p = container_of(work, struct kfd_process,
 					     release_work);
 
+	kfd_dbg_trap_disable(p);
+
+	if (atomic_read(&p->debugged_process_count) > 0) {
+		struct kfd_process *target;
+		unsigned int temp;
+		int idx = srcu_read_lock(&kfd_processes_srcu);
+
+		hash_for_each_rcu(kfd_processes_table, temp, target, kfd_processes) {
+			if (target->debugger_process && target->debugger_process == p) {
+				mutex_lock(&target->mutex);
+				kfd_dbg_trap_disable(target);
+				mutex_unlock(&target->mutex);
+				if (atomic_read(&p->debugged_process_count) == 0)
+					break;
+			}
+		}
+
+		srcu_read_unlock(&kfd_processes_srcu, idx);
+	}
+
 	kfd_process_dequeue_from_all_devices(p);
 	pqm_uninit(&p->pqm);
 
@@ -1200,11 +1210,14 @@  static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.free_notifier = kfd_process_free_notifier,
 };
 
-static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
+int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 {
 	unsigned long  offset;
 	int i;
 
+	if (p->has_cwsr)
+		return 0;
+
 	for (i = 0; i < p->n_pdds; i++) {
 		struct kfd_dev *dev = p->pdds[i]->dev;
 		struct qcm_process_device *qpd = &p->pdds[i]->qpd;
@@ -1233,6 +1246,8 @@  static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
 			qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
 	}
 
+	p->has_cwsr = true;
+
 	return 0;
 }
 
@@ -1375,6 +1390,10 @@  static struct kfd_process *create_process(const struct task_struct *thread)
 	if (err)
 		goto err_event_init;
 	process->is_32bit_user_mode = in_compat_syscall();
+	process->debug_trap_enabled = false;
+	process->debugger_process = NULL;
+	process->exception_enable_mask = 0;
+	atomic_set(&process->debugged_process_count, 0);
 
 	process->pasid = kfd_pasid_alloc();
 	if (process->pasid == 0) {
@@ -1422,6 +1441,8 @@  static struct kfd_process *create_process(const struct task_struct *thread)
 	kfd_unref_process(process);
 	get_task_struct(process->lead_thread);
 
+	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);
+
 	return process;
 
 err_register_notifier:
@@ -1908,8 +1929,10 @@  static void restore_process_worker(struct work_struct *work)
 	 */
 
 	p->last_restore_timestamp = get_jiffies_64();
-	ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
-						     &p->ef);
+	/* VMs may not have been acquired yet during debugging. */
+	if (p->kgd_process_info)
+		ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
+							     &p->ef);
 	if (ret) {
 		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
 			 p->pasid, PROCESS_BACK_OFF_TIME_MS);