diff mbox

[16/16] drm/radeon: implement ring saving on reset

Message ID 1341830523-30320-17-git-send-email-deathsimple@vodafone.de (mailing list archive)
State New, archived
Headers show

Commit Message

Christian König July 9, 2012, 10:42 a.m. UTC
Try to save whatever is on the rings when
we encounter a lockup.

Signed-off-by: Christian König <deathsimple@vodafone.de>
---
 drivers/gpu/drm/radeon/radeon.h        |    4 ++
 drivers/gpu/drm/radeon/radeon_device.c |   44 ++++++++++++++++----
 drivers/gpu/drm/radeon/radeon_ring.c   |   69 ++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 8 deletions(-)

Comments

Michel Dänzer July 9, 2012, 3:06 p.m. UTC | #1
On Mon, 2012-07-09 at 12:42 +0200, Christian König wrote: 
> Try to save whatever is on the rings when
> we encounter an lockup.
> 
> Signed-off-by: Christian König <deathsimple@vodafone.de>
[...] 
> @@ -1005,20 +1010,43 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>  	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>  	radeon_suspend(rdev);
>  
> +	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +		ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
> +						   &ring_data[i]);
> +		if (ring_sizes[i]) {
> +			saved = true;
> +			dev_info(rdev->dev, "Saved %d dwords of commands "
> +				 "on ring %d.\n", ring_sizes[i], i);
> +		}
> +	}
> +
> +retry:
>  	r = radeon_asic_reset(rdev);
>  	if (!r) {
> -		dev_info(rdev->dev, "GPU reset succeed\n");
> +		dev_info(rdev->dev, "GPU reset succeed trying to resume\n");

Could fix the spelling of 'succeeded' while you're at it. :)


> 		radeon_resume(rdev);
> +	}
>  
> -		r = radeon_ib_ring_tests(rdev);
> -		if (r)
> -			DRM_ERROR("ib ring test failed (%d).\n", r);
> +	radeon_restore_bios_scratch_regs(rdev);
> +	drm_helper_resume_force_mode(rdev->ddev);
> +
> +	if (!r) {
> +		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
> +			radeon_ring_restore(rdev, &rdev->ring[i],
> +					    ring_sizes[i], ring_data[i]);
> +		}

If radeon_asic_reset fails, this leaks the memory referenced by
ring_data, doesn't it?


Also, the added functions aren't documented as mandated by the rules
Alex proposed.
Christian König July 9, 2012, 3:24 p.m. UTC | #2
On 09.07.2012 17:06, Michel Dänzer wrote:
> On Mon, 2012-07-09 at 12:42 +0200, Christian König wrote:
>> Try to save whatever is on the rings when
>> we encounter an lockup.
>>
>> Signed-off-by: Christian König <deathsimple@vodafone.de>
> [...]
>> @@ -1005,20 +1010,43 @@ int radeon_gpu_reset(struct radeon_device *rdev)
>>   	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
>>   	radeon_suspend(rdev);
>>   
>> +	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> +		ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
>> +						   &ring_data[i]);
>> +		if (ring_sizes[i]) {
>> +			saved = true;
>> +			dev_info(rdev->dev, "Saved %d dwords of commands "
>> +				 "on ring %d.\n", ring_sizes[i], i);
>> +		}
>> +	}
>> +
>> +retry:
>>   	r = radeon_asic_reset(rdev);
>>   	if (!r) {
>> -		dev_info(rdev->dev, "GPU reset succeed\n");
>> +		dev_info(rdev->dev, "GPU reset succeed trying to resume\n");
> Could fix the spelling of 'succeeded' while you're at it. :)
Akk, fixed it.
>
>
>> 		radeon_resume(rdev);
>> +	}
>>   
>> -		r = radeon_ib_ring_tests(rdev);
>> -		if (r)
>> -			DRM_ERROR("ib ring test failed (%d).\n", r);
>> +	radeon_restore_bios_scratch_regs(rdev);
>> +	drm_helper_resume_force_mode(rdev->ddev);
>> +
>> +	if (!r) {
>> +		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
>> +			radeon_ring_restore(rdev, &rdev->ring[i],
>> +					    ring_sizes[i], ring_data[i]);
>> +		}
> If radeon_asic_reset fails, this leaks the memory referenced by
> ring_data, doesn't it?
Oh yes indeed, going to fix that.

> Also, the added functions aren't documented as mandated by the rules
> Alex proposed.
>
True, also going to fix that.

Christian.
diff mbox

Patch

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 9c11be8..1265840 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -770,6 +770,10 @@  int radeon_ring_test(struct radeon_device *rdev, struct radeon_ring *cp);
 void radeon_ring_force_activity(struct radeon_device *rdev, struct radeon_ring *ring);
 void radeon_ring_lockup_update(struct radeon_ring *ring);
 bool radeon_ring_test_lockup(struct radeon_device *rdev, struct radeon_ring *ring);
+unsigned radeon_ring_backup(struct radeon_device *rdev, struct radeon_ring *ring,
+			    uint32_t **data);
+int radeon_ring_restore(struct radeon_device *rdev, struct radeon_ring *ring,
+			unsigned size, uint32_t *data);
 int radeon_ring_init(struct radeon_device *rdev, struct radeon_ring *cp,
 		     unsigned ring_size, unsigned align,
 		     unsigned rptr_offs, unsigned rptr_reg, unsigned wptr_reg,
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index bbd0971..97696e5 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -996,7 +996,12 @@  int radeon_resume_kms(struct drm_device *dev)
 
 int radeon_gpu_reset(struct radeon_device *rdev)
 {
-	int r;
+	unsigned ring_sizes[RADEON_NUM_RINGS];
+	uint32_t *ring_data[RADEON_NUM_RINGS];
+
+	bool saved = false;
+
+	int i, r;
 	int resched;
 
 	down_write(&rdev->exclusive_lock);
@@ -1005,20 +1010,43 @@  int radeon_gpu_reset(struct radeon_device *rdev)
 	resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
 	radeon_suspend(rdev);
 
+	for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+		ring_sizes[i] = radeon_ring_backup(rdev, &rdev->ring[i],
+						   &ring_data[i]);
+		if (ring_sizes[i]) {
+			saved = true;
+			dev_info(rdev->dev, "Saved %d dwords of commands "
+				 "on ring %d.\n", ring_sizes[i], i);
+		}
+	}
+
+retry:
 	r = radeon_asic_reset(rdev);
 	if (!r) {
-		dev_info(rdev->dev, "GPU reset succeed\n");
+		dev_info(rdev->dev, "GPU reset succeed trying to resume\n");
 		radeon_resume(rdev);
+	}
 
-		r = radeon_ib_ring_tests(rdev);
-		if (r)
-			DRM_ERROR("ib ring test failed (%d).\n", r);
+	radeon_restore_bios_scratch_regs(rdev);
+	drm_helper_resume_force_mode(rdev->ddev);
+
+	if (!r) {
+		for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+			radeon_ring_restore(rdev, &rdev->ring[i],
+					    ring_sizes[i], ring_data[i]);
+		}
 
-		radeon_restore_bios_scratch_regs(rdev);
-		drm_helper_resume_force_mode(rdev->ddev);
-		ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
+		r = radeon_ib_ring_tests(rdev);
+		if (r) {
+			dev_err(rdev->dev, "ib ring test failed (%d).\n", r);
+			if (saved) {
+				radeon_suspend(rdev);
+				goto retry;
+			}
+		}
 	}
 
+	ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
 		dev_info(rdev->dev, "GPU reset failed\n");
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index 994c98c..6ce51d6 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -385,6 +385,98 @@  static unsigned radeon_ring_first_valid_commit(struct radeon_ring *ring)
 	return result;
 }
 
+/**
+ * radeon_ring_backup - save the unprocessed content of a ring
+ * @rdev: radeon device the ring belongs to
+ * @ring: ring to back up
+ * @data: set to a kmalloc'ed copy of the saved dwords, or to NULL
+ *
+ * Copies everything between the first commit that has not been
+ * processed so far and the current wptr into a newly allocated buffer.
+ *
+ * Returns the number of dwords saved.  When 0 is returned (ring not
+ * initialized, nothing pending, or out of memory) *data is NULL;
+ * otherwise the caller owns *data and has to kfree() it.
+ */
+unsigned radeon_ring_backup(struct radeon_device *rdev, struct radeon_ring *ring,
+			    uint32_t **data)
+{
+	unsigned size = 0, ptr, i;
+	int commit;
+
+	/* just in case lock the ring */
+	mutex_lock(&rdev->ring_lock);
+	*data = NULL;
+
+	if (ring->ring_obj == NULL)
+		goto out;
+
+	/* first of all update the rptr directly from the hw */
+	ring->rptr = (RREG32(ring->rptr_reg) & ring->ptr_reg_mask)
+		 >> ring->ptr_reg_shift;
+
+	/* find the first commit not processed so far */
+	commit = radeon_ring_first_valid_commit(ring);
+	if (commit == ring->track_ptr)
+		goto out;
+
+	/* calculate the number of dw on the ring */
+	ptr = ring->track_back[commit];
+	size = (ring->wptr + (ring->ring_size / 4) - ptr) & ring->ptr_mask;
+	if (size == 0)
+		goto out;
+
+	/* and then save the content of the ring */
+	*data = kmalloc(size * sizeof(uint32_t), GFP_KERNEL);
+	if (*data == NULL) {
+		/* out of memory: report that nothing was saved */
+		size = 0;
+		goto out;
+	}
+	for (i = 0; i < size; ++i) {
+		(*data)[i] = ring->ring[ptr++];
+		ptr &= ring->ptr_mask;
+	}
+
+out:
+	mutex_unlock(&rdev->ring_lock);
+	return size;
+}
+
+/**
+ * radeon_ring_restore - write saved commands back to a ring
+ * @rdev: radeon device the ring belongs to
+ * @ring: ring to write the commands to
+ * @size: number of dwords in @data
+ * @data: dwords to write; released with kfree() on success
+ *
+ * Does nothing and returns 0 if @size is 0 or @data is NULL.  On
+ * success the dwords are written and committed, @data is freed and 0
+ * is returned.  If radeon_ring_lock() fails its error code is returned
+ * and @data is NOT freed, so the caller still owns the buffer.
+ */
+int radeon_ring_restore(struct radeon_device *rdev, struct radeon_ring *ring,
+			unsigned size, uint32_t *data)
+{
+	unsigned i;
+	int r;
+
+	if (!size || !data)
+		return 0;
+
+	/* restore the saved ring content */
+	r = radeon_ring_lock(rdev, ring, size);
+	if (r)
+		return r;
+
+	for (i = 0; i < size; ++i)
+		radeon_ring_write(ring, data[i]);
+
+	radeon_ring_unlock_commit(rdev, ring);
+	kfree(data);
+	return 0;
+}
+
 int radeon_ring_init(struct radeon_device *rdev, struct radeon_ring *ring,
 		     unsigned ring_size, unsigned align,
 		     unsigned rptr_offs, unsigned rptr_reg, unsigned wptr_reg,