diff mbox series

[v2] drm/amdgpu:fix IH overflow on Cz

Message ID 1609831941-2277695-1-git-send-email-bodefang@126.com (mailing list archive)
State New, archived
Headers show
Series [v2] drm/amdgpu:fix IH overflow on Cz | expand

Commit Message

Defang Bo Jan. 5, 2021, 7:32 a.m. UTC
Similar to commit b82175750131 ("drm/amdgpu: fix IH overflow on Vega10 v2").
When a ring buffer overflow happens, the appropriate bit is set in the WPTR
register, which is also written back to memory. But clearing the bit in the
WPTR doesn't trigger another memory writeback.

So what can happen is that we end up processing the buffer overflow over and
over again because the bit is never cleared, resulting in a random system
lockup because of an infinite loop in an interrupt handler.

Signed-off-by: Defang Bo <bodefang@126.com>
---
Changes since v1:
- Modify the code and correct the wrong fix. 
---
---
 drivers/gpu/drm/amd/amdgpu/cz_ih.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

Comments

Christian König Jan. 5, 2021, 10:27 a.m. UTC | #1
Am 05.01.21 um 08:32 schrieb Defang Bo:
> Similar to commit <b82175750131>("drm/amdgpu: fix IH overflow on Vega10 v2").
> When an ring buffer overflow happens the appropriate bit is set in the WPTR
> register which is also written back to memory. But clearing the bit in the
> WPTR doesn't trigger another memory writeback.
>
> So what can happen is that we end up processing the buffer overflow over and
> over again because the bit is never cleared. Resulting in a random system
> lockup because of an infinite loop in an interrupt handler.

Really good point. I haven't had time to look into other generations 
since fixing this for Vega.

One major typo below which needs to be fixed.

>
> Signed-off-by: Defang Bo <bodefang@126.com>
> ---
> Changes since v1:
> - Modify the code and correct the wrong fix.
> ---
> ---
>   drivers/gpu/drm/amd/amdgpu/cz_ih.c | 38 ++++++++++++++++++++++++--------------
>   1 file changed, 24 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> index 1dca0cabc326..45dd47f45fa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> @@ -190,22 +190,32 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev,
>   			  struct amdgpu_ih_ring *ih)
>   {
>   	u32 wptr, tmp;
> -
> +
>   	wptr = le32_to_cpu(*ih->wptr_cpu);
>   
> -	if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
> -		wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> -		/* When a ring buffer overflow happen start parsing interrupt
> -		 * from the last not overwritten vector (wptr + 16). Hopefully
> -		 * this should allow us to catchup.
> -		 */
> -		dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> -			wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> -		ih->rptr = (wptr + 16) & ih->ptr_mask;
> -		tmp = RREG32(mmIH_RB_CNTL);
> -		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> -		WREG32(mmIH_RB_CNTL, tmp);
> -	}
> +	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> +		goto out;
> +
> +	wptr = RREG32(mmIH_RB_CNTL);

That's the wrong register, you need to read the write pointer and not 
the control register.

Same problem in all other patches.

Regards,
Christian.

> +
> +	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
> +		goto out;
> +
> +	wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
> +
> +	/* When a ring buffer overflow happen start parsing interrupt
> +	 * from the last not overwritten vector (wptr + 16). Hopefully
> +	 * this should allow us to catchup.
> +	 */
> +	dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
> +		wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
> +	ih->rptr = (wptr + 16) & ih->ptr_mask;
> +	tmp = RREG32(mmIH_RB_CNTL);
> +	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
> +	WREG32(mmIH_RB_CNTL, tmp);
> +
> +
> +out:
>   	return (wptr & ih->ptr_mask);
>   }
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
index 1dca0cabc326..45dd47f45fa2 100644
--- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
@@ -190,22 +190,32 @@  static u32 cz_ih_get_wptr(struct amdgpu_device *adev,
 			  struct amdgpu_ih_ring *ih)
 {
 	u32 wptr, tmp;
-
+
 	wptr = le32_to_cpu(*ih->wptr_cpu);
 
-	if (REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW)) {
-		wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
-		/* When a ring buffer overflow happen start parsing interrupt
-		 * from the last not overwritten vector (wptr + 16). Hopefully
-		 * this should allow us to catchup.
-		 */
-		dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
-			wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
-		ih->rptr = (wptr + 16) & ih->ptr_mask;
-		tmp = RREG32(mmIH_RB_CNTL);
-		tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
-		WREG32(mmIH_RB_CNTL, tmp);
-	}
+	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
+		goto out;
+
+	/* Double check that the overflow wasn't already cleared. */
+	wptr = RREG32(mmIH_RB_WPTR);
+
+	if (!REG_GET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW))
+		goto out;
+
+	wptr = REG_SET_FIELD(wptr, IH_RB_WPTR, RB_OVERFLOW, 0);
+
+	/* When a ring buffer overflow happens, start parsing interrupts
+	 * from the last not overwritten vector (wptr + 16). Hopefully
+	 * this should allow us to catch up.
+	 */
+	dev_warn(adev->dev, "IH ring buffer overflow (0x%08X, 0x%08X, 0x%08X)\n",
+		wptr, ih->rptr, (wptr + 16) & ih->ptr_mask);
+	ih->rptr = (wptr + 16) & ih->ptr_mask;
+	tmp = RREG32(mmIH_RB_CNTL);
+	tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1);
+	WREG32(mmIH_RB_CNTL, tmp);
+
+out:
 	return (wptr & ih->ptr_mask);
 }