diff mbox series

[1/2] drm/amdgpu: Move racy global PMU list into device

Message ID 20221028224813.1466450-1-briannorris@chromium.org (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/amdgpu: Move racy global PMU list into device | expand

Commit Message

Brian Norris Oct. 28, 2022, 10:48 p.m. UTC
If there are multiple amdgpu devices, this list processing can be racy.

We're really treating this like a per-device list, so make that explicit
and remove the global list.

Signed-off-by: Brian Norris <briannorris@chromium.org>
---

 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 12 +++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

Comments

Alex Deucher Nov. 8, 2022, 4:11 p.m. UTC | #1
On Fri, Oct 28, 2022 at 6:48 PM Brian Norris <briannorris@chromium.org> wrote:
>
> If there are multiple amdgpu devices, this list processing can be racy.
>
> We're really treating this like a per-device list, so make that explicit
> and remove the global list.
>
> Signed-off-by: Brian Norris <briannorris@chromium.org>

@Kuehling, Felix @Kim, Jonathan can you take a look at this patch?

Thanks,

Alex


> ---
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  4 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 12 +++++-------
>  2 files changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0e6ddf05c23c..e968b7f2417c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1063,6 +1063,10 @@ struct amdgpu_device {
>         struct work_struct              reset_work;
>
>         bool                            job_hang;
> +
> +#if IS_ENABLED(CONFIG_PERF_EVENTS)
> +       struct list_head pmu_list;
> +#endif
>  };
>
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> index 71ee361d0972..24f2055a2f23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> @@ -23,6 +23,7 @@
>
>  #include <linux/perf_event.h>
>  #include <linux/init.h>
> +#include <linux/list.h>
>  #include "amdgpu.h"
>  #include "amdgpu_pmu.h"
>
> @@ -72,9 +73,6 @@ static ssize_t amdgpu_pmu_event_show(struct device *dev,
>                         amdgpu_pmu_attr->event_str, amdgpu_pmu_attr->type);
>  }
>
> -static LIST_HEAD(amdgpu_pmu_list);
> -
> -
>  struct amdgpu_pmu_attr {
>         const char *name;
>         const char *config;
> @@ -558,7 +556,7 @@ static int init_pmu_entry_by_type_and_add(struct amdgpu_pmu_entry *pmu_entry,
>                 pr_info("Detected AMDGPU %d Perf Events.\n", total_num_events);
>
>
> -       list_add_tail(&pmu_entry->entry, &amdgpu_pmu_list);
> +       list_add_tail(&pmu_entry->entry, &pmu_entry->adev->pmu_list);
>
>         return 0;
>  err_register:
> @@ -579,9 +577,7 @@ void amdgpu_pmu_fini(struct amdgpu_device *adev)
>  {
>         struct amdgpu_pmu_entry *pe, *temp;
>
> -       list_for_each_entry_safe(pe, temp, &amdgpu_pmu_list, entry) {
> -               if (pe->adev != adev)
> -                       continue;
> +       list_for_each_entry_safe(pe, temp, &adev->pmu_list, entry) {
>                 list_del(&pe->entry);
>                 perf_pmu_unregister(&pe->pmu);
>                 kfree(pe->pmu.attr_groups);
> @@ -623,6 +619,8 @@ int amdgpu_pmu_init(struct amdgpu_device *adev)
>         int ret = 0;
>         struct amdgpu_pmu_entry *pmu_entry, *pmu_entry_df;
>
> +       INIT_LIST_HEAD(&adev->pmu_list);
> +
>         switch (adev->asic_type) {
>         case CHIP_VEGA20:
>                 pmu_entry_df = create_pmu_entry(adev, AMDGPU_PMU_PERF_TYPE_DF,
> --
> 2.38.1.273.g43a17bfeac-goog
>
Felix Kuehling Nov. 8, 2022, 4:50 p.m. UTC | #2
On 2022-10-28 18:48, Brian Norris wrote:
> If there are multiple amdgpu devices, this list processing can be racy.
>
> We're really treating this like a per-device list, so make that explicit
> and remove the global list.

I agree with the problem and the solution. See one comment inline.


>
> Signed-off-by: Brian Norris <briannorris@chromium.org>
> ---
>
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 12 +++++-------
>   2 files changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0e6ddf05c23c..e968b7f2417c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1063,6 +1063,10 @@ struct amdgpu_device {
>   	struct work_struct		reset_work;
>   
>   	bool                            job_hang;
> +
> +#if IS_ENABLED(CONFIG_PERF_EVENTS)
> +	struct list_head pmu_list;
> +#endif
>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> index 71ee361d0972..24f2055a2f23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
> @@ -23,6 +23,7 @@
>   
>   #include <linux/perf_event.h>
>   #include <linux/init.h>
> +#include <linux/list.h>
>   #include "amdgpu.h"
>   #include "amdgpu_pmu.h"
>   
> @@ -72,9 +73,6 @@ static ssize_t amdgpu_pmu_event_show(struct device *dev,
>   			amdgpu_pmu_attr->event_str, amdgpu_pmu_attr->type);
>   }
>   
> -static LIST_HEAD(amdgpu_pmu_list);
> -
> -
>   struct amdgpu_pmu_attr {
>   	const char *name;
>   	const char *config;
> @@ -558,7 +556,7 @@ static int init_pmu_entry_by_type_and_add(struct amdgpu_pmu_entry *pmu_entry,
>   		pr_info("Detected AMDGPU %d Perf Events.\n", total_num_events);
>   
>   
> -	list_add_tail(&pmu_entry->entry, &amdgpu_pmu_list);
> +	list_add_tail(&pmu_entry->entry, &pmu_entry->adev->pmu_list);

While you're making the pmu list per-device, I'd suggest removing adev 
from the pmu entry because it is now redundant. The device is implied by 
the list that the entry is on. Instead, add an adev parameter to 
init_pmu_entry_by_type_and_add. Or you could move the list_add_tail to 
amdgpu_pmu_init and remove "_and_add" from the function name.

Other than that, the patch looks good to me.

Regards,
   Felix


>   
>   	return 0;
>   err_register:
> @@ -579,9 +577,7 @@ void amdgpu_pmu_fini(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_pmu_entry *pe, *temp;
>   
> -	list_for_each_entry_safe(pe, temp, &amdgpu_pmu_list, entry) {
> -		if (pe->adev != adev)
> -			continue;
> +	list_for_each_entry_safe(pe, temp, &adev->pmu_list, entry) {
>   		list_del(&pe->entry);
>   		perf_pmu_unregister(&pe->pmu);
>   		kfree(pe->pmu.attr_groups);
> @@ -623,6 +619,8 @@ int amdgpu_pmu_init(struct amdgpu_device *adev)
>   	int ret = 0;
>   	struct amdgpu_pmu_entry *pmu_entry, *pmu_entry_df;
>   
> +	INIT_LIST_HEAD(&adev->pmu_list);
> +
>   	switch (adev->asic_type) {
>   	case CHIP_VEGA20:
>   		pmu_entry_df = create_pmu_entry(adev, AMDGPU_PMU_PERF_TYPE_DF,
Brian Norris Nov. 9, 2022, 1:22 a.m. UTC | #3
On Tue, Nov 08, 2022 at 11:50:04AM -0500, Felix Kuehling wrote:
> While you're making the pmu list per-device, I'd suggest removing adev from
> the pmu entry because it is now redundant. The device is implied by the list
> that the entry is on. Instead, add an adev parameter to
> init_pmu_entry_by_type_and_add. Or you could move the list_add_tail to
> amdgpu_pmu_init and remove "_and_add" from the function name.

Sorry if I'm being naive here, but does that mean trying to navigate the
list pointers to move from a 'pmu_entry' to an 'adev'
(list_first_entry(), etc.)? There are quite a few cases where we're
trying to go between 'pmu_entry' and 'adev'. I guess I could turn that
into a mini helper.

I'll also need to scrounge around a bit to see if I have an amdgpu
system around that actually supports PMU. I realized the one I tested on
doesn't actually hit this code path... and this would be getting a
little less obvious/trivial.

> Other than that, the patch looks good to me.

Thanks for looking!

Brian
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0e6ddf05c23c..e968b7f2417c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1063,6 +1063,10 @@  struct amdgpu_device {
 	struct work_struct		reset_work;
 
 	bool                            job_hang;
+
+#if IS_ENABLED(CONFIG_PERF_EVENTS)
+	struct list_head pmu_list;
+#endif
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
index 71ee361d0972..24f2055a2f23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c
@@ -23,6 +23,7 @@ 
 
 #include <linux/perf_event.h>
 #include <linux/init.h>
+#include <linux/list.h>
 #include "amdgpu.h"
 #include "amdgpu_pmu.h"
 
@@ -72,9 +73,6 @@  static ssize_t amdgpu_pmu_event_show(struct device *dev,
 			amdgpu_pmu_attr->event_str, amdgpu_pmu_attr->type);
 }
 
-static LIST_HEAD(amdgpu_pmu_list);
-
-
 struct amdgpu_pmu_attr {
 	const char *name;
 	const char *config;
@@ -558,7 +556,7 @@  static int init_pmu_entry_by_type_and_add(struct amdgpu_pmu_entry *pmu_entry,
 		pr_info("Detected AMDGPU %d Perf Events.\n", total_num_events);
 
 
-	list_add_tail(&pmu_entry->entry, &amdgpu_pmu_list);
+	list_add_tail(&pmu_entry->entry, &pmu_entry->adev->pmu_list);
 
 	return 0;
 err_register:
@@ -579,9 +577,7 @@  void amdgpu_pmu_fini(struct amdgpu_device *adev)
 {
 	struct amdgpu_pmu_entry *pe, *temp;
 
-	list_for_each_entry_safe(pe, temp, &amdgpu_pmu_list, entry) {
-		if (pe->adev != adev)
-			continue;
+	list_for_each_entry_safe(pe, temp, &adev->pmu_list, entry) {
 		list_del(&pe->entry);
 		perf_pmu_unregister(&pe->pmu);
 		kfree(pe->pmu.attr_groups);
@@ -623,6 +619,8 @@  int amdgpu_pmu_init(struct amdgpu_device *adev)
 	int ret = 0;
 	struct amdgpu_pmu_entry *pmu_entry, *pmu_entry_df;
 
+	INIT_LIST_HEAD(&adev->pmu_list);
+
 	switch (adev->asic_type) {
 	case CHIP_VEGA20:
 		pmu_entry_df = create_pmu_entry(adev, AMDGPU_PMU_PERF_TYPE_DF,