diff mbox series

[31/32] drm/amdkfd: add debug device snapshot operation

Message ID 20230125195401.4183544-32-jonathan.kim@amd.com (mailing list archive)
State New, archived
Headers show
Series Upstream of kernel support for AMDGPU ISA debugging | expand

Commit Message

Kim, Jonathan Jan. 25, 2023, 7:54 p.m. UTC
Similar to queue snapshot, return an array of device information using
an entry_size check and return.
Unlike queue snapshots, the debugger needs to pass the correct number of
devices that exist.  If it fails to do so, the KFD will return the
number of actual devices so that the debugger can make a subsequent
successful call.

v3: was reviewed but re-requesting review with new revision and
subvendor information.
memset 0 device info entry to clear padding.

v2: change buf_size arg to num_devices for more clarity.
expand device entry new members on copy.
fix minimum entry size calculation for queue and device snapshot.
change device snapshot implementation to match queue snapshot
implementation.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 72 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 82 insertions(+), 2 deletions(-)

Comments

Felix Kuehling March 22, 2023, 9:54 p.m. UTC | #1
Am 2023-01-25 um 14:54 schrieb Jonathan Kim:
> Similar to queue snapshot, return an array of device information using
> an entry_size check and return.
> Unlike queue snapshots, the debugger needs to pass the correct number of
> devices that exist.  If it fails to do so, the KFD will return the
> number of actual devices so that the debugger can make a subsequent
> successful call.
>
> v3: was reviewed but re-requesting review with new revision and
> subvendor information.
> memset 0 device info entry to clear padding.
>
> v2: change buf_size arg to num_devices for more clarity.
> expand device entry new members on copy.
> fix minimum entry size calculation for queue and device snapshot.
> change device snapshot implementation to match queue snapshot
> implementation.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 72 ++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
>   3 files changed, 82 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 93b288233577..da74a6ef4d9b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2972,8 +2972,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->queue_snapshot.entry_size);
>   		break;
>   	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
> -		pr_warn("Debug op %i not supported yet\n", args->op);
> -		r = -EACCES;
> +		r = kfd_dbg_trap_device_snapshot(target,
> +				args->device_snapshot.exception_mask,
> +				(void __user *)args->device_snapshot.snapshot_buf_ptr,
> +				&args->device_snapshot.num_devices,
> +				&args->device_snapshot.entry_size);
>   		break;
>   	default:
>   		pr_err("Invalid option: %i\n", args->op);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index db316f0625f8..d1c4eb9652fd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -22,6 +22,7 @@
>   
>   #include "kfd_debug.h"
>   #include "kfd_device_queue_manager.h"
> +#include "kfd_topology.h"
>   #include <linux/file.h>
>   #include <uapi/linux/kfd_ioctl.h>
>   
> @@ -998,6 +999,77 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
> +		uint64_t exception_clear_mask,
> +		void __user *user_info,
> +		uint32_t *number_of_device_infos,
> +		uint32_t *entry_size)
> +{
> +	struct kfd_dbg_device_info_entry device_info;
> +	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
> +	int i, r = 0;
> +
> +	if (!(target && user_info && number_of_device_infos && entry_size))
> +		return -EINVAL;
> +
> +	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
> +	*number_of_device_infos = target->n_pdds;
> +	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
> +
> +	if (!tmp_num_devices)
> +		return 0;
> +
> +	memset(&device_info, 0, sizeof(device_info));
> +
> +	mutex_lock(&target->event_mutex);
> +
> +	/* Run over all pdd of the process */
> +	for (i = 0; i < tmp_num_devices; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
> +
> +		device_info.gpu_id = pdd->dev->id;
> +		device_info.exception_status = pdd->exception_status;
> +		device_info.lds_base = pdd->lds_base;
> +		device_info.lds_limit = pdd->lds_limit;
> +		device_info.scratch_base = pdd->scratch_base;
> +		device_info.scratch_limit = pdd->scratch_limit;
> +		device_info.gpuvm_base = pdd->gpuvm_base;
> +		device_info.gpuvm_limit = pdd->gpuvm_limit;
> +		device_info.location_id = topo_dev->node_props.location_id;
> +		device_info.vendor_id = topo_dev->node_props.vendor_id;
> +		device_info.device_id = topo_dev->node_props.device_id;
> +		device_info.revision_id = pdd->dev->adev->pdev->revision;
> +		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
> +		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
> +		device_info.fw_version = pdd->dev->mec_fw_version;
> +		device_info.gfx_target_version =
> +			topo_dev->node_props.gfx_target_version;
> +		device_info.simd_count = topo_dev->node_props.simd_count;
> +		device_info.max_waves_per_simd =
> +			topo_dev->node_props.max_waves_per_simd;
> +		device_info.array_count = topo_dev->node_props.array_count;
> +		device_info.simd_arrays_per_engine =
> +			topo_dev->node_props.simd_arrays_per_engine;
> +		device_info.capability = topo_dev->node_props.capability;
> +		device_info.debug_prop = topo_dev->node_props.debug_prop;
> +
> +		if (exception_clear_mask)
> +			pdd->exception_status &= ~exception_clear_mask;
> +
> +		if (copy_to_user(user_info, &device_info, *entry_size)) {
> +			r = -EFAULT;
> +			break;
> +		}
> +
> +		user_info += tmp_entry_size;
> +	}
> +
> +	mutex_unlock(&target->event_mutex);
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index ee12de5f7adf..b31e453704fc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -81,6 +81,11 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
>   }
>   
>   void debug_event_write_work_handler(struct work_struct *work);
> +int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
> +		uint64_t exception_clear_mask,
> +		void __user *user_info,
> +		uint32_t *number_of_device_infos,
> +		uint32_t *entry_size);
>   
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask);
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 93b288233577..da74a6ef4d9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2972,8 +2972,11 @@  static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
 				&args->queue_snapshot.entry_size);
 		break;
 	case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-		pr_warn("Debug op %i not supported yet\n", args->op);
-		r = -EACCES;
+		r = kfd_dbg_trap_device_snapshot(target,
+				args->device_snapshot.exception_mask,
+				(void __user *)args->device_snapshot.snapshot_buf_ptr,
+				&args->device_snapshot.num_devices,
+				&args->device_snapshot.entry_size);
 		break;
 	default:
 		pr_err("Invalid option: %i\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index db316f0625f8..d1c4eb9652fd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -22,6 +22,7 @@ 
 
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_topology.h"
 #include <linux/file.h>
 #include <uapi/linux/kfd_ioctl.h>
 
@@ -998,6 +999,77 @@  int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
 	return r;
 }
 
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+		uint64_t exception_clear_mask,
+		void __user *user_info,
+		uint32_t *number_of_device_infos,
+		uint32_t *entry_size)
+{
+	struct kfd_dbg_device_info_entry device_info;
+	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
+	int i, r = 0;
+
+	if (!(target && user_info && number_of_device_infos && entry_size))
+		return -EINVAL;
+
+	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
+	*number_of_device_infos = target->n_pdds;
+	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
+
+	if (!tmp_num_devices)
+		return 0;
+
+	memset(&device_info, 0, sizeof(device_info));
+
+	mutex_lock(&target->event_mutex);
+
+	/* Run over all pdd of the process */
+	for (i = 0; i < tmp_num_devices; i++) {
+		struct kfd_process_device *pdd = target->pdds[i];
+		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
+
+		device_info.gpu_id = pdd->dev->id;
+		device_info.exception_status = pdd->exception_status;
+		device_info.lds_base = pdd->lds_base;
+		device_info.lds_limit = pdd->lds_limit;
+		device_info.scratch_base = pdd->scratch_base;
+		device_info.scratch_limit = pdd->scratch_limit;
+		device_info.gpuvm_base = pdd->gpuvm_base;
+		device_info.gpuvm_limit = pdd->gpuvm_limit;
+		device_info.location_id = topo_dev->node_props.location_id;
+		device_info.vendor_id = topo_dev->node_props.vendor_id;
+		device_info.device_id = topo_dev->node_props.device_id;
+		device_info.revision_id = pdd->dev->adev->pdev->revision;
+		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
+		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
+		device_info.fw_version = pdd->dev->mec_fw_version;
+		device_info.gfx_target_version =
+			topo_dev->node_props.gfx_target_version;
+		device_info.simd_count = topo_dev->node_props.simd_count;
+		device_info.max_waves_per_simd =
+			topo_dev->node_props.max_waves_per_simd;
+		device_info.array_count = topo_dev->node_props.array_count;
+		device_info.simd_arrays_per_engine =
+			topo_dev->node_props.simd_arrays_per_engine;
+		device_info.capability = topo_dev->node_props.capability;
+		device_info.debug_prop = topo_dev->node_props.debug_prop;
+
+		if (exception_clear_mask)
+			pdd->exception_status &= ~exception_clear_mask;
+
+		if (copy_to_user(user_info, &device_info, *entry_size)) {
+			r = -EFAULT;
+			break;
+		}
+
+		user_info += tmp_entry_size;
+	}
+
+	mutex_unlock(&target->event_mutex);
+
+	return r;
+}
+
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index ee12de5f7adf..b31e453704fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -81,6 +81,11 @@  static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
 }
 
 void debug_event_write_work_handler(struct work_struct *work);
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+		uint64_t exception_clear_mask,
+		void __user *user_info,
+		uint32_t *number_of_device_infos,
+		uint32_t *entry_size);
 
 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
 					uint64_t exception_set_mask);