diff mbox series

[RESEND,4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.

Message ID 1576096732-3596-4-git-send-email-andrey.grodzovsky@amd.com (mailing list archive)
State New, archived
Headers show
Series [RESEND,1/5] drm/amdgpu: reverts commit b01245ff54db66073b104ac9d9fbefb7b264b36d. | expand

Commit Message

Andrey Grodzovsky Dec. 11, 2019, 8:38 p.m. UTC
Use task barrier in XGMI hive to synchronize ASIC resets
across devices in XGMI hive.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

Comments

Ma, Le Dec. 12, 2019, 4:05 a.m. UTC | #1
[AMD Official Use Only - Internal Distribution Only]






-----Original Message-----
From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Sent: Thursday, December 12, 2019 4:39 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset synchronization.



Use task barrier in XGMI hive to synchronize ASIC resets across devices in XGMI hive.



Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com<mailto:andrey.grodzovsky@amd.com>>

---

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 +++++++++++++++++++++++++-----

1 file changed, 36 insertions(+), 6 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 1d19edfa..e4089a0 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -67,6 +67,7 @@

#include "amdgpu_tmz.h"



 #include <linux/suspend.h>

+#include <drm/task_barrier.h>



 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");

MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");

@@ -2663,14 +2664,43 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)  {

           struct amdgpu_device *adev =

                       container_of(__work, struct amdgpu_device, xgmi_reset_work);

+          struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);



-           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

-                       adev->asic_reset_res = (adev->in_baco == false) ?

-                                               amdgpu_device_baco_enter(adev->ddev) :

-                                               qamdgpu_device_baco_exit(adev->ddev);

-           else

-                       adev->asic_reset_res = amdgpu_asic_reset(adev);

+          /*

+          * Use task barrier to synchronize all xgmi reset works across the

+          * hive.

+          * task_barrier_enter and task_barrier_exit will block untill all the

+          * threads running the xgmi reset works reach those points. I assume

+          * guarantee of progress here for all the threads as the workqueue code

+          * creates new worker threads as needed by amount of work items in queue

+          * (see worker_thread) and also each thread sleeps in the barrir and by

+          * this yielding the CPU for other work threads to make progress.

+          */

[Le]: This comments can be adjusted since we switch to system_unbound_wq in patch #5.

+          if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

+

+                      if (hive)

+                                  task_barrier_enter(&hive->tb);

[Le]: The multiple hive condition can be checked only once and moved to the location right after the assignment.

+

+                      adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);

+

+                      if (adev->asic_reset_res)

+                                  goto fail;

+

+                      if (hive)

+                                  task_barrier_exit(&hive->tb);

[Le]: Same as above.

+

+                      adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);

+

+                      if (adev->asic_reset_res)

+                                  goto fail;

+          } else {

+                      if (hive)

+                                  task_barrier_full(&hive->tb);

[Le]: Same as above.



With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com<mailto:Le.Ma@amd.com>>



Regards,

Ma Le

+

+                      adev->asic_reset_res =  amdgpu_asic_reset(adev);

+          }



+fail:

           if (adev->asic_reset_res)

                       DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

                                    adev->asic_reset_res, adev->ddev->unique);

--

2.7.4
Andrey Grodzovsky Dec. 12, 2019, 3:20 p.m. UTC | #2
On 12/11/19 11:05 PM, Ma, Le wrote:
>
> [AMD Official Use Only - Internal Distribution Only]
>
> -----Original Message-----
> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> Sent: Thursday, December 12, 2019 4:39 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Ma, Le 
> <Le.Ma@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Quan, Evan 
> <Evan.Quan@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [RESEND PATCH 4/5] Subject: drm/amdgpu: Redo XGMI reset 
> synchronization.
>
> Use task barrier in XGMI hive to synchronize ASIC resets across 
> devices in XGMI hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com 
> <mailto:andrey.grodzovsky@amd.com>>
>
> ---
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 42 
> +++++++++++++++++++++++++-----
>
> 1 file changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> index 1d19edfa..e4089a0 100644
>
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>
> @@ -67,6 +67,7 @@
>
> #include "amdgpu_tmz.h"
>
>  #include <linux/suspend.h>
>
> +#include <drm/task_barrier.h>
>
>  MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>
> MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>
> @@ -2663,14 +2664,43 @@ static void 
> amdgpu_device_xgmi_reset_func(struct work_struct *__work)  {
>
>            struct amdgpu_device *adev =
>
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> +          struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
>
> -           if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>
> - adev->asic_reset_res = (adev->in_baco == false) ?
>
> - amdgpu_device_baco_enter(adev->ddev) :
>
> - qamdgpu_device_baco_exit(adev->ddev);
>
> -           else
>
> - adev->asic_reset_res = amdgpu_asic_reset(adev);
>
> +          /*
>
> +          * Use task barrier to synchronize all xgmi reset works 
> across the
>
> +          * hive.
>
> +          * task_barrier_enter and task_barrier_exit will block 
> untill all the
>
> +          * threads running the xgmi reset works reach those points. 
> I assume
>
> +          * guarantee of progress here for all the threads as the 
> workqueue code
>
> +          * creates new worker threads as needed by amount of work 
> items in queue
>
> +          * (see worker_thread) and also each thread sleeps in the 
> barrir and by
>
> +          * this yielding the CPU for other work threads to make 
> progress.
>
> +          */
>
> [Le]: This comments can be adjusted since we switch to 
> system_unbound_wq in patch #5.
>
> +          if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>
> +
>
> +                      if (hive)
>
> + task_barrier_enter(&hive->tb);
>
> [Le]: The multiple hive condition can be checked only once and moved 
> to the location right after the assignment.
>

Not sure what you meant here but in fact let's note that while in 
amdgpu_device_xgmi_reset_func it's a bug for amdgpu_get_xgmi_hive to 
return NULL so I think better instead to add WARN_ON(!hive,"...") and 
return right at the beginning of the function if indeed hive == NULL

Andrey


> +
>
> + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
>
> +
>
> +                      if (adev->asic_reset_res)
>
> +                                  goto fail;
>
> +
>
> +                      if (hive)
>
> + task_barrier_exit(&hive->tb);
>
> [Le]: Same as above.
>
> +
>
> + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
>
> +
>
> +                      if (adev->asic_reset_res)
>
> +                                  goto fail;
>
> +          } else {
>
> +                      if (hive)
>
> + task_barrier_full(&hive->tb);
>
> [Le]: Same as above.
>
> With above addressed, Reviewed-by: Le Ma <Le.Ma@amd.com 
> <mailto:Le.Ma@amd.com>>
>
> Regards,
>
> Ma Le
>
> +
>
> + adev->asic_reset_res =  amdgpu_asic_reset(adev);
>
> +          }
>
> +fail:
>
>            if (adev->asic_reset_res)
>
>                        DRM_WARN("ASIC reset failed with error, %d for 
> drm dev, %s",
>
>  adev->asic_reset_res, adev->ddev->unique);
>
> --
>
> 2.7.4
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1d19edfa..e4089a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -67,6 +67,7 @@ 
 #include "amdgpu_tmz.h"
 
 #include <linux/suspend.h>
+#include <drm/task_barrier.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -2663,14 +2664,43 @@  static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 
-	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
-		adev->asic_reset_res = (adev->in_baco == false) ?
-				amdgpu_device_baco_enter(adev->ddev) :
-				qamdgpu_device_baco_exit(adev->ddev);
-	else
-		adev->asic_reset_res = amdgpu_asic_reset(adev);
+	/*
+	 * Use task barrier to synchronize all xgmi reset works across the
+	 * hive.
+	 * task_barrier_enter and task_barrier_exit will block untill all the
+	 * threads running the xgmi reset works reach those points. I assume
+	 * guarantee of progress here for all the threads as the workqueue code
+	 * creates new worker threads as needed by amount of work items in queue
+	 * (see worker_thread) and also each thread sleeps in the barrir and by
+	 * this yielding the CPU for other work threads to make progress.
+	 */
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+		if (hive)
+			task_barrier_enter(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+
+		if (hive)
+			task_barrier_exit(&hive->tb);
+
+		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+		if (adev->asic_reset_res)
+			goto fail;
+	} else {
+		if (hive)
+			task_barrier_full(&hive->tb);
+
+		adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	}
 
+fail:
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);