diff mbox

radeon error on resume: "cp failed to get scratch reg" - anyone interested?

Message ID 1348045973.3218.16.camel@thor.local (mailing list archive)
State New, archived
Headers show

Commit Message

Michel Dänzer Sept. 19, 2012, 9:12 a.m. UTC
On Mon, 2012-09-17 at 15:06 +0200, Simon Kitching wrote: 
> Hi,
> 
> I've noticed that on resume from suspend, dmesg reports:
> 
> [21895.997536] [drm] radeon: 1 quad pipes, 2 z pipes initialized.
> [21896.012072] [drm] PCIE GART of 512M enabled (table at 
> 0x0000000000040000).
> [21896.012082] radeon 0000:01:00.0: WB enabled
> [21896.012085] radeon 0000:01:00.0: fence driver on ring 0 use gpu addr 
> 0x0000000010000000 and cpu addr 0xffccb000
> [21896.012138] [drm] radeon: ring at 0x0000000010001000
> [21896.012157] [drm:r100_ring_test] *ERROR* radeon: cp failed to get 
> scratch reg (-22).
> [21896.012158] [drm:r100_cp_init] *ERROR* radeon: cp isn't working (-22).
> [21896.012160] radeon 0000:01:00.0: failed initializing CP (-22).
> 
> Graphics still seems to work fine though.
> 
> Is this info of interest to anyone? I'm happy to do additional testing 
> if that is useful.

Does the patch below fix the failure to get a scratch register?


From 6780dca112752e1fc4e7d410441b39874565b754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
Date: Wed, 19 Sep 2012 11:09:14 +0200
Subject: [PATCH] drm/radeon: Fix scratch register leak in IB test.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restructure the code to jump out via labels instead of directly returning
early.
---
 drivers/gpu/drm/radeon/r100.c |   12 ++++++------
 drivers/gpu/drm/radeon/r600.c |   12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

Comments

Simon Kitching Sept. 20, 2012, 6:54 a.m. UTC | #1
On 19/09/12 11:12, Michel Dänzer wrote:
> On Mon, 2012-09-17 at 15:06 +0200, Simon Kitching wrote:
>> Hi,
>>
>> I've noticed that on resume from suspend, dmesg reports:
>>
>> [21895.997536] [drm] radeon: 1 quad pipes, 2 z pipes initialized.
>> [21896.012072] [drm] PCIE GART of 512M enabled (table at
>> 0x0000000000040000).
>> [21896.012082] radeon 0000:01:00.0: WB enabled
>> [21896.012085] radeon 0000:01:00.0: fence driver on ring 0 use gpu addr
>> 0x0000000010000000 and cpu addr 0xffccb000
>> [21896.012138] [drm] radeon: ring at 0x0000000010001000
>> [21896.012157] [drm:r100_ring_test] *ERROR* radeon: cp failed to get
>> scratch reg (-22).
>> [21896.012158] [drm:r100_cp_init] *ERROR* radeon: cp isn't working (-22).
>> [21896.012160] radeon 0000:01:00.0: failed initializing CP (-22).
>>
>> Graphics still seems to work fine though.
>>
>> Is this info of interest to anyone? I'm happy to do additional testing
>> if that is useful.
> Does the patch below fix the failure to get a scratch register?
>
>
>  From 6780dca112752e1fc4e7d410441b39874565b754 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
> Date: Wed, 19 Sep 2012 11:09:14 +0200
> Subject: [PATCH] drm/radeon: Fix scratch register leak in IB test.
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> Restructure the code to jump out via labels instead of directly returning
> early.
> ---
>   drivers/gpu/drm/radeon/r100.c |   12 ++++++------
>   drivers/gpu/drm/radeon/r600.c |   12 ++++++------
>   2 files changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 8acb34f..819c352 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -3818,7 +3818,7 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   	WREG32(scratch, 0xCAFEDEAD);
>   	r = radeon_ib_get(rdev, RADEON_RING_TYPE_GFX_INDEX, &ib, 256);
>   	if (r) {
> -		return r;
> +		goto free_scratch;
>   	}
>   	ib.ptr[0] = PACKET0(scratch, 0);
>   	ib.ptr[1] = 0xDEADBEEF;
> @@ -3831,13 +3831,11 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   	ib.length_dw = 8;
>   	r = radeon_ib_schedule(rdev, &ib, NULL);
>   	if (r) {
> -		radeon_scratch_free(rdev, scratch);
> -		radeon_ib_free(rdev, &ib);
> -		return r;
> +		goto free_ib;
>   	}
>   	r = radeon_fence_wait(ib.fence, false);
>   	if (r) {
> -		return r;
> +		goto free_ib;
>   	}
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
> @@ -3853,8 +3851,10 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   			  scratch, tmp);
>   		r = -EINVAL;
>   	}
> -	radeon_scratch_free(rdev, scratch);
> +free_ib:
>   	radeon_ib_free(rdev, &ib);
> +free_scratch:
> +	radeon_scratch_free(rdev, scratch);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index d79c639..38b546e 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -2638,7 +2638,7 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   	r = radeon_ib_get(rdev, ring->idx, &ib, 256);
>   	if (r) {
>   		DRM_ERROR("radeon: failed to get ib (%d).\n", r);
> -		return r;
> +		goto free_scratch;
>   	}
>   	ib.ptr[0] = PACKET3(PACKET3_SET_CONFIG_REG, 1);
>   	ib.ptr[1] = ((scratch - PACKET3_SET_CONFIG_REG_OFFSET) >> 2);
> @@ -2646,15 +2646,13 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   	ib.length_dw = 3;
>   	r = radeon_ib_schedule(rdev, &ib, NULL);
>   	if (r) {
> -		radeon_scratch_free(rdev, scratch);
> -		radeon_ib_free(rdev, &ib);
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> -		return r;
> +		goto free_ib;
>   	}
>   	r = radeon_fence_wait(ib.fence, false);
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> -		return r;
> +		goto free_ib;
>   	}
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
> @@ -2669,8 +2667,10 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   			  scratch, tmp);
>   		r = -EINVAL;
>   	}
> -	radeon_scratch_free(rdev, scratch);
> +free_ib:
>   	radeon_ib_free(rdev, &ib);
> +free_scratch:
> +	radeon_scratch_free(rdev, scratch);
>   	return r;
>   }
>   
Sadly the patch does not fix the issue. However I agree that this patch 
is a good thing - the original code does clearly leak scratch registers 
in some failure conditions.

Maybe this patch could also add the DRM_ERROR calls for these error 
cases from the r600 version into the r100 version?

Note that on failure of radeon_ib_schedule, the order of resource 
releases *was*
   radeon_scratch_free
   radeon_ib_free
and with this patch is:
   radeon_ib_free
   radeon_scratch_free
This change looks correct to me, ie post-patch the frees are in reverse 
order to original allocation.

If you do submit this patch, please add "reviewed-by 
skitching@vonos.net" or "tested-by" or whatever you think the 
appropriate tag is.

Thanks to this patch pointing me to the appropriate area, I have 
actually found the real cause of the scratch-register leak, and will 
post a candidate patch today.

Regards,Simon
Christian König Sept. 20, 2012, 8:09 a.m. UTC | #2
----------------------------------------------------------------------
From: "Simon Kitching" <skitching@vonos.net>
To: "Michel Dänzer" <michel@daenzer.net>
Date: Thu, 20 Sep 2012 09:38:56 +0200
Subject: Re: radeon error on resume: "cp failed to get scratch reg" -
anyoneinterested?


> On 19/09/12 11:12, Michel Dänzer wrote:
> > On Mon, 2012-09-17 at 15:06 +0200, Simon Kitching wrote:
> >> Hi,
> >>
> >> I've noticed that on resume from suspend, dmesg reports:
> >>
> >> [21895.997536] [drm] radeon: 1 quad pipes, 2 z pipes initialized.
> >> [21896.012072] [drm] PCIE GART of 512M enabled (table at
> >> 0x0000000000040000).
> >> [21896.012082] radeon 0000:01:00.0: WB enabled
> >> [21896.012085] radeon 0000:01:00.0: fence driver on ring 0 use gpu addr
> >> 0x0000000010000000 and cpu addr 0xffccb000
> >> [21896.012138] [drm] radeon: ring at 0x0000000010001000
> >> [21896.012157] [drm:r100_ring_test] *ERROR* radeon: cp failed to get
> >> scratch reg (-22).
> >> [21896.012158] [drm:r100_cp_init] *ERROR* radeon: cp isn't working (-22).
> >> [21896.012160] radeon 0000:01:00.0: failed initializing CP (-22).
> >>
> >> Graphics still seems to work fine though.
> >>
> >> Is this info of interest to anyone? I'm happy to do additional testing
> >> if that is useful.
> > Does the patch below fix the failure to get a scratch register?
> >
> >
> >  From 6780dca112752e1fc4e7d410441b39874565b754 Mon Sep 17 00:00:00 2001
> > From: =?UTF-8?q?Michel=20D=C3=A4nzer?= <michel.daenzer@amd.com>
> > Date: Wed, 19 Sep 2012 11:09:14 +0200
> > Subject: [PATCH] drm/radeon: Fix scratch register leak in IB test.
> > MIME-Version: 1.0
> > Content-Type: text/plain; charset=UTF-8
> > Content-Transfer-Encoding: 8bit
> >
> > Restructure the code to jump out via labels instead of directly returning
> > early.
> > ---
> >   drivers/gpu/drm/radeon/r100.c |   12 ++++++------
> >   drivers/gpu/drm/radeon/r600.c |   12 ++++++------
> >   2 files changed, 12 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> > index 8acb34f..819c352 100644
> > --- a/drivers/gpu/drm/radeon/r100.c
> > +++ b/drivers/gpu/drm/radeon/r100.c
> > @@ -3818,7 +3818,7 @@ int r100_ib_test(struct radeon_device *rdev,
struct radeon_ring *ring)
> >   	WREG32(scratch, 0xCAFEDEAD);
> >   	r = radeon_ib_get(rdev, RADEON_RING_TYPE_GFX_INDEX, &ib, 256);
> >   	if (r) {
> > -		return r;
> > +		goto free_scratch;
> >   	}
> >   	ib.ptr[0] = PACKET0(scratch, 0);
> >   	ib.ptr[1] = 0xDEADBEEF;
> > @@ -3831,13 +3831,11 @@ int r100_ib_test(struct radeon_device *rdev,
struct radeon_ring *ring)
> >   	ib.length_dw = 8;
> >   	r = radeon_ib_schedule(rdev, &ib, NULL);
> >   	if (r) {
> > -		radeon_scratch_free(rdev, scratch);
> > -		radeon_ib_free(rdev, &ib);
> > -		return r;
> > +		goto free_ib;
> >   	}
> >   	r = radeon_fence_wait(ib.fence, false);
> >   	if (r) {
> > -		return r;
> > +		goto free_ib;
> >   	}
> >   	for (i = 0; i < rdev->usec_timeout; i++) {
> >   		tmp = RREG32(scratch);
> > @@ -3853,8 +3851,10 @@ int r100_ib_test(struct radeon_device *rdev,
struct radeon_ring *ring)
> >   			  scratch, tmp);
> >   		r = -EINVAL;
> >   	}
> > -	radeon_scratch_free(rdev, scratch);
> > +free_ib:
> >   	radeon_ib_free(rdev, &ib);
> > +free_scratch:
> > +	radeon_scratch_free(rdev, scratch);
> >   	return r;
> >   }
> >   
> > diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> > index d79c639..38b546e 100644
> > --- a/drivers/gpu/drm/radeon/r600.c
> > +++ b/drivers/gpu/drm/radeon/r600.c
> > @@ -2638,7 +2638,7 @@ int r600_ib_test(struct radeon_device *rdev,
struct radeon_ring *ring)
> >   	r = radeon_ib_get(rdev, ring->idx, &ib, 256);
> >   	if (r) {
> >   		DRM_ERROR("radeon: failed to get ib (%d).\n", r);
> > -		return r;
> > +		goto free_scratch;
> >   	}
> >   	ib.ptr[0] = PACKET3(PACKET3_SET_CONFIG_REG, 1);
> >   	ib.ptr[1] = ((scratch - PACKET3_SET_CONFIG_REG_OFFSET) >> 2);
> > @@ -2646,15 +2646,13 @@ int r600_ib_test(struct radeon_device *rdev,
struct radeon_ring *ring)
> >   	ib.length_dw = 3;
> >   	r = radeon_ib_schedule(rdev, &ib, NULL);
> >   	if (r) {
> > -		radeon_scratch_free(rdev, scratch);
> > -		radeon_ib_free(rdev, &ib);
> >   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> > -		return r;
> > +		goto free_ib;
> >   	}
> >   	r = radeon_fence_wait(ib.fence, false);
> >   	if (r) {
> >   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> > -		return r;
> > +		goto free_ib;
> >   	}
> >   	for (i = 0; i < rdev->usec_timeout; i++) {
> >   		tmp = RREG32(scratch);
> > @@ -2669,8 +2667,10 @@ int r600_ib_test(struct radeon_device *rdev,
struct radeon_ring *ring)
> >   			  scratch, tmp);
> >   		r = -EINVAL;
> >   	}
> > -	radeon_scratch_free(rdev, scratch);
> > +free_ib:
> >   	radeon_ib_free(rdev, &ib);
> > +free_scratch:
> > +	radeon_scratch_free(rdev, scratch);
> >   	return r;
> >   }
> >   
> Sadly the patch does not fix the issue. However I agree that this patch 
> is a good thing - the original code does clearly leak scratch registers 
> in some failure conditions.
> 
> Maybe this patch could also add the DRM_ERROR calls for these error 
> cases from the r600 version into the r100 version?

Sounds like a good idea to me.

> 
> Note that on failure of radeon_ib_schedule, the order of resource 
> releases *was*
>    radeon_scratch_free
>    radeon_ib_free
> and with this patch is:
>    radeon_ib_free
>    radeon_scratch_free
> This change looks correct to me, ie post-patch the frees are in reverse 
> order to original allocation.

The order doesn't really matter in that case, so the patch Michel proposed
should do quite fine. So please add:

Reviewed-by: Christian König <christian.koenig@amd.com>

> 
> If you do submit this patch, please add "reviewed-by 
> skitching@vonos.net" or "tested-by" or whatever you think the 
> appropriate tag is.
> 
> Thanks to this patch pointing me to the appropriate area, I have 
> actually found the real cause of the scratch-register leak, and will 
> post a candidate patch today.

Sounds good, please post the patch, or at least a description of what is
really going wrong here.

Cheers,
Christian.

> 
> Regards,Simon
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>
Michel Dänzer Sept. 20, 2012, 9:06 a.m. UTC | #3
On Don, 2012-09-20 at 08:54 +0200, Simon Kitching wrote: 
> On 19/09/12 11:12, Michel Dänzer wrote:
> > On Mon, 2012-09-17 at 15:06 +0200, Simon Kitching wrote:
> >> Hi,
> >>
> >> I've noticed that on resume from suspend, dmesg reports:
> >>
> >> [21895.997536] [drm] radeon: 1 quad pipes, 2 z pipes initialized.
> >> [21896.012072] [drm] PCIE GART of 512M enabled (table at
> >> 0x0000000000040000).
> >> [21896.012082] radeon 0000:01:00.0: WB enabled
> >> [21896.012085] radeon 0000:01:00.0: fence driver on ring 0 use gpu addr
> >> 0x0000000010000000 and cpu addr 0xffccb000
> >> [21896.012138] [drm] radeon: ring at 0x0000000010001000
> >> [21896.012157] [drm:r100_ring_test] *ERROR* radeon: cp failed to get
> >> scratch reg (-22).
> >> [21896.012158] [drm:r100_cp_init] *ERROR* radeon: cp isn't working (-22).
> >> [21896.012160] radeon 0000:01:00.0: failed initializing CP (-22).
> >>
> >> Graphics still seems to work fine though.
> >>
> >> Is this info of interest to anyone? I'm happy to do additional testing
> >> if that is useful.
> > Does the patch below fix the failure to get a scratch register?

[...]

> Sadly the patch does not fix the issue. However I agree that this patch 
> is a good thing - the original code does clearly leak scratch registers 
> in some failure conditions.
> 
> Maybe this patch could also add the DRM_ERROR calls for these error 
> cases from the r600 version into the r100 version?

Yeah, that's a good idea, done.

> Note that on failure of radeon_ib_schedule, the order of resource 
> releases *was*
>    radeon_scratch_free
>    radeon_ib_free
> and with this patch is:
>    radeon_ib_free
>    radeon_scratch_free
> This change looks correct to me, ie post-patch the frees are in reverse 
> order to original allocation.

Right, this is necessary for the new label exit structure to work.

> If you do submit this patch, please add "reviewed-by 
> skitching@vonos.net" or "tested-by" or whatever you think the 
> appropriate tag is.

I used the former, thanks for your (and Christian's) review!


> Thanks to this patch pointing me to the appropriate area, I have 
> actually found the real cause of the scratch-register leak, and will 
> post a candidate patch today.

Excellent, looking forward to it.
diff mbox

Patch

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 8acb34f..819c352 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -3818,7 +3818,7 @@  int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	WREG32(scratch, 0xCAFEDEAD);
 	r = radeon_ib_get(rdev, RADEON_RING_TYPE_GFX_INDEX, &ib, 256);
 	if (r) {
-		return r;
+		goto free_scratch;
 	}
 	ib.ptr[0] = PACKET0(scratch, 0);
 	ib.ptr[1] = 0xDEADBEEF;
@@ -3831,13 +3831,11 @@  int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	ib.length_dw = 8;
 	r = radeon_ib_schedule(rdev, &ib, NULL);
 	if (r) {
-		radeon_scratch_free(rdev, scratch);
-		radeon_ib_free(rdev, &ib);
-		return r;
+		goto free_ib;
 	}
 	r = radeon_fence_wait(ib.fence, false);
 	if (r) {
-		return r;
+		goto free_ib;
 	}
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = RREG32(scratch);
@@ -3853,8 +3851,10 @@  int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 			  scratch, tmp);
 		r = -EINVAL;
 	}
-	radeon_scratch_free(rdev, scratch);
+free_ib:
 	radeon_ib_free(rdev, &ib);
+free_scratch:
+	radeon_scratch_free(rdev, scratch);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index d79c639..38b546e 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2638,7 +2638,7 @@  int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	r = radeon_ib_get(rdev, ring->idx, &ib, 256);
 	if (r) {
 		DRM_ERROR("radeon: failed to get ib (%d).\n", r);
-		return r;
+		goto free_scratch;
 	}
 	ib.ptr[0] = PACKET3(PACKET3_SET_CONFIG_REG, 1);
 	ib.ptr[1] = ((scratch - PACKET3_SET_CONFIG_REG_OFFSET) >> 2);
@@ -2646,15 +2646,13 @@  int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	ib.length_dw = 3;
 	r = radeon_ib_schedule(rdev, &ib, NULL);
 	if (r) {
-		radeon_scratch_free(rdev, scratch);
-		radeon_ib_free(rdev, &ib);
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
-		return r;
+		goto free_ib;
 	}
 	r = radeon_fence_wait(ib.fence, false);
 	if (r) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
-		return r;
+		goto free_ib;
 	}
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = RREG32(scratch);
@@ -2669,8 +2667,10 @@  int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 			  scratch, tmp);
 		r = -EINVAL;
 	}
-	radeon_scratch_free(rdev, scratch);
+free_ib:
 	radeon_ib_free(rdev, &ib);
+free_scratch:
+	radeon_scratch_free(rdev, scratch);
 	return r;
 }